Data
Feature Data

Weather Prediction Model Feature Data Processing

This document serves as a detailed guide for the Python script used to process weather data for energy simulation purposes, especially for use with EnergyPlus or other building energy simulation tools. The generated data is crucial as feature data for training indoor weather prediction models, enhancing the accuracy and efficiency of simulations.

Introduction

The script performs a series of data manipulations on weather data files, specifically EPW files, to prepare them for energy simulation and as feature data for indoor weather prediction model training. It includes functions for data cleaning, conversion, and interpolation to higher temporal resolutions.

Usage

Ensure that the EPW files are located in the specified directory, and that the output directories exist or can be created by the script.

Output

The script generates new CSV files with processed weather data suitable for use in energy simulation software. The data is interpolated to the specified time interval and includes additional computed parameters.

Constants

SIGMA = 5.6697e-8  # Stefan-Boltzmann constant
KELVIN = 273.15  # Offset for Celsius to Kelvin conversion
SECS_IN_MONTH = 86400 * 365 / 12  # Approximate number of seconds in a month

Functions

correct_hour_24

Corrects the '24:00' hour notation in a datetime string to '00:00' of the following day.

Parameters:

  • time_str: A string representing the datetime.

Returns:

  • A string with the corrected datetime.
def correct_hour_24(time_str):
    if time_str.endswith("24:00"):
        date_part, _ = time_str.split(" ")
        corrected_date = pd.to_datetime(date_part) + pd.DateOffset(days=1)
        return corrected_date.strftime("%Y-%m-%d 00:00")
    else:
        return time_str

day_light_sum

Calculates the daily sum of light based on radiation data.

Inputs:

  • time: Timestamps of radiation data in regular intervals.
  • rad: Corresponding radiation data in watts per square meter.

Output:

  • An array of daily radiation sums in megajoules per square meter.
def day_light_sum(time, rad):
    interval = (time[1] - time[0]) * 86400  # interval in time data, in seconds
    mn_before = 0  # the midnight before the current point
    mn_after = (
        np.where(np.diff(np.floor(time)) == 1)[0][0] + 1
    )  # the midnight after the current point
 
    light_sum = np.zeros(len(time))
 
    for k in range(len(time)):
        # sum from midnight before until midnight after (not including)
        light_sum[k] = np.sum(rad[mn_before:mn_after])
 
        if k == mn_after - 1:  # reached new day
            mn_before = mn_after
            new_day_indices = np.where(
                np.diff(np.floor(time[mn_before + 1:])) == 1)[0]
 
            if len(new_day_indices) > 0:
                mn_after = new_day_indices[0] + mn_before + 2
            else:
                mn_after = len(time)
 
    # convert to MJ/m2/day
    light_sum = light_sum * interval * 1e-6
 
    return light_sum

rh2vaporDens

Converts relative humidity and temperature to vapor density.

def rh2vaporDens(temp, rh):
    R = 8.3144598  # molar gas constant [J mol^{-1} K^{-1}]
    C2K = 273.15  # conversion from Celsius to Kelvin [K]
    Mw = 18.01528e-3  # molar mass of water [kg mol^-{1}]
    p = [610.78, 238.3, 17.2694, -6140.4, 273, 28.916]
    # Saturation vapor pressure of air in given temperature [Pa]
    satP = p[0] * np.exp(p[2] * temp / (temp + p[1]))
    pascals = (rh / 100) * satP  # Partial pressure of vapor in air [Pa]
    # convert to density using the ideal gas law
    vaporDens = pascals * Mw / (R * (temp + C2K))
    return vaporDens
 

vaporDens2pres

Converts vapor density back to vapor pressure.

def vaporDens2pres(temp, vaporDens):
    p = [610.78, 238.3, 17.2694, -6140.4, 273, 28.916]
    rh = vaporDens / rh2vaporDens(temp, 100)  # relative humidity [0-1]
    # Saturation vapor pressure of air in given temperature [Pa]
    satP = p[0] * np.exp(p[2] * temp / (temp + p[1]))
    vaporPres = satP * rh
    return vaporPres
 

co2ppm2dens

Converts CO2 concentration in parts per million to density in kilograms per cubic meter.

def co2ppm2dens(temp, ppm):
    R = 8.3144598  # molar gas constant [J mol^{-1} K^{-1}]
    C2K = 273.15  # conversion from Celsius to Kelvin [K]
    M_CO2 = 44.01e-3  # molar mass of CO2 [kg mol^-{1}]
    P = 101325  # pressure (assumed to be 1 atm) [Pa]
    # number of moles n = m / M_CO2 where m is the mass [kg] and M_CO2 is the
    # molar mass [kg mol^{-1}]. So m = p * V * M_CO2 * P / RT where V is 10^-6 * ppm
    co2Dens = P * 10**-6 * ppm * M_CO2 / (R * (temp + C2K))
    return co2Dens

read_epw_data

Reads an EPW file and returns a pandas DataFrame with the data.

def read_epw_data(epw_path):
    column_names = [
        "Year",
        "Month",
        "Day",
        "Hour",
        "Minute",
        "Data Source and Uncertainty Flags",
        "Dry Bulb Temperature (°C)",
        "Dew Point Temperature (°C)",
        "Relative Humidity (%)",
        "Atmospheric Station Pressure (Pa)",
        "Extraterrestrial Horizontal Radiation (J/m²)",
        "Extraterrestrial Direct Normal Radiation (J/m²)",
        "Horizontal Infrared Radiation Intensity (J/m²)",
        "Global Horizontal Radiation (J/m²)",
        "Direct Normal Radiation (J/m²)",
        "Diffuse Horizontal Radiation (J/m²)",
        "Global Horizontal Illuminance (lux)",
        "Direct Normal Illuminance (lux)",
        "Diffuse Horizontal Illuminance (lux)",
        "Zenith Luminance (klux)",
        "Wind Direction (deg)",
        "Wind Speed (m/s)",
        "Total Sky Cover (tenths)",
        "Opaque Sky Cover (tenths)",
        "Visibility (km)",
        "Ceiling Height (m)",
        "Present Weather Observation (code)",
        "Present Weather Codes (code)",
        "Precipitable Water (mm)",
        "Aerosol Optical Depth (km)",
        "Snow Depth (cm)",
        "Days Since Last Snowfall (day)",
        "Albedo",
        "Liquid Precipitation Depth (mm)",
        "Liquid Precipitation Quantity (hr)",
    ]
    epw_data = pd.read_csv(epw_path, skiprows=8, header=None)
 
    assert (
        len(column_names) == epw_data.shape[1]
    ), "Column names length doesn't match data columns number."
    epw_data.columns = column_names
 
    return epw_data
 

preprocess_epw_data

Preprocesses the EPW weather data by cleaning and reformatting timestamps.

def preprocess_epw_data(epw_data):
    epw_data["Minute"] = epw_data["Minute"].astype(str).replace("60", "0")
    epw_data["DateTime"] = (
        epw_data["Year"].astype(str)
        + "-"
        + epw_data["Month"].astype(str).str.zfill(2)
        + "-"
        + epw_data["Day"].astype(str).str.zfill(2)
        + " "
        + epw_data["Hour"].astype(str).str.zfill(2)
        + ":"
        + epw_data["Minute"].astype(str).str.zfill(2)
    )
    epw_data = epw_data.drop(
        ["Year", "Month", "Day", "Hour", "Minute"], axis=1)
    columns = ["DateTime"] + \
        [col for col in epw_data.columns if col != "DateTime"]
    epw_data = epw_data.reindex(columns=columns)
    epw_data["DateTime"] = epw_data["DateTime"].apply(correct_hour_24)
    epw_data["DateTime"] = pd.to_datetime(epw_data["DateTime"])
    return epw_data

compute_additional_data

Computes additional weather parameters like vapor density, CO2 density, and sky temperature.

def compute_additional_data(epw_data, df):
    # Define the soilTsec function
    def soilTsec(sec):
        # This function calculates the soil temperature at a given second
        return (
            s[0] * np.sin(2 * np.pi * sec / SECS_IN_MONTH /
                          per + 2 * np.pi / s[2])
            + s[3]
        )
 
    # Define the cost function
    def cost_function(b):
        # This function calculates the sum of the squared differences between the observed and predicted values
        return np.sum(
            (b[0] * np.sin(2 * np.pi * x / per + 2 * np.pi / b[2]) + b[3] - y) ** 2
        )
 
    # Get the elevation from the first row of the dataframe
    elevation = df.iloc[0, 0].split(",")[-1]
 
    # Get the global horizontal radiation, dry bulb temperature, relative humidity, and horizontal infrared radiation intensity from the epw_data
    rad = epw_data["Global Horizontal Radiation (J/m²)"]
    temp = epw_data["Dry Bulb Temperature (°C)"]
    rh = epw_data["Relative Humidity (%)"]
    radSky = epw_data["Horizontal Infrared Radiation Intensity (J/m²)"]
 
    # Calculate the vapor density, vapor pressure, and CO2 density
    vaporDens = rh2vaporDens(temp, rh)
    vapor_pressure = vaporDens2pres(temp, vaporDens)
    co2 = co2ppm2dens(temp, 410)
    # Calculate the sky temperature
    skyT = (radSky / SIGMA) ** 0.25 - KELVIN
 
    # SoilT calculations
    soilT2 = df.iloc[3, 0].split(",")[1:]
    soilT2 = soilT2[21:33]
    y = np.array(soilT2, dtype=float)
    x = np.linspace(0.5, 11.5, len(y))
 
    # Calculate the zero-crossings of the soil temperature
    yz = y - np.max(y) + (np.max(y) - np.min(y)) / 2
    shifted_yz = np.roll(yz, 1)
    crossings = np.where(yz * shifted_yz <= 0)[0]
    zx = x[crossings]
    per = 2 * np.mean(np.diff(zx))
 
    # Minimize the cost function to find the best parameters
    initial_guess = [np.max(y) - np.min(y), per, -1, np.mean(y)]
    result = minimize(cost_function, initial_guess, method="Nelder-Mead")
    s = result.x
    s[1] = per
 
    # Calculate the soil temperature for each hour of the year
    startTime = epw_data["DateTime"].iloc[0]
    secsInYear = (startTime - datetime(startTime.year,
                  1, 1, 0, 0, 0)).total_seconds()
    time_diff_seconds = np.arange(0, len(epw_data["DateTime"]) * 3600, 3600)
    soilT = soilTsec(secsInYear + time_diff_seconds)
 
    # Return the calculated values
    return vaporDens, co2, skyT, soilT, elevation
 

combine_all_data

Combines all weather parameters into a new DataFrame.

def combine_all_data(epw_data, vaporDens, co2, skyT, soilT, elevation):
 
    interval = 3600
    daily_rad_splits = np.array_split(
        epw_data["Global Horizontal Radiation (J/m²)"], len(epw_data) / 24
    )
    daily_rad_sum = [np.sum(day * interval * 1e-6) for day in daily_rad_splits]
    expanded_daily_rad_sum = np.repeat(daily_rad_sum, 24)
    expanded_daily_rad_sum_shifted = np.roll(expanded_daily_rad_sum, -1)
    expanded_daily_rad_sum_shifted[-1] = 0
 
    vapor_pressure = vaporDens2pres(
        epw_data["Dry Bulb Temperature (°C)"], vaporDens)
 
    data = pd.DataFrame(
        {
            "DateTime": epw_data["DateTime"],
            "Outdoor global irradiation(W m⁻²)": epw_data[
                "Global Horizontal Radiation (J/m²)"
            ],
            "Outdoor air temperature(°C)": epw_data["Dry Bulb Temperature (°C)"],
            "Outdoor vapor concentration(kg m⁻³)": vaporDens,
            "Outdoor CO2 concentration(kg{CO2} m⁻³{air})": co2,
            "Outdoor wind speed(m s⁻¹)": epw_data["Wind Speed (m/s)"],
            "Sky temperature(°C)": skyT,
            "temperature of external soil layer (°C)": soilT,
            "daily light sum (MJ m^{-2} day^{-1})": expanded_daily_rad_sum_shifted,
            "elevation(m)": elevation,
            "Outdoor vapor pressure(Pa)": vapor_pressure,
        }
    )
    return data
 

interpolate_to_hires

Interpolates data to a higher temporal resolution using piecewise cubic hermite interpolating polynomial.

def interpolate_to_hires(startTime, data, length, epw_data, interval=300):
    # Calculate the time difference
    length = len(epw_data["DateTime"])
 
    # Original 1-hour interval, representing the seconds from the start time to the end time, every 1 hour.
    time_diff_seconds = np.arange(0, length * 3600, 3600)
 
    # Use the pchip interpolation method to interpolate the data from a 1-hour interval to an xx-minute interval
    new_time_diff_seconds = np.arange(0, length * 3600, 300)  # xx-minute interval
    new_datetimes = [
        startTime + pd.Timedelta(seconds=int(s)) for s in new_time_diff_seconds
    ]
 
    # Create a new DataFrame to save the data at xx-minute intervals
    hires_data = pd.DataFrame({"DateTime": new_datetimes})
 
    # For each column in the data (excluding the first one), interpolate the data to the new time intervals
    for column in data.columns[1:]:
        hires_data[column] = pchip_interpolate(
            time_diff_seconds, data[column].values, new_time_diff_seconds
        )
 
    # Return the high-resolution data
    return hires_data
 

datestr_to_matlab_datenum

Converts a datetime object to MATLAB's datenum.

def datestr_to_matlab_datenum(date_obj) -> float:
    """Convert a datetime or Timestamp object to MATLAB's datenum."""
 
    # If the input is a pandas Timestamp, convert it to a datetime object
    if isinstance(date_obj, pd.Timestamp):
        date_obj = date_obj.to_pydatetime()
 
    # Calculate days from 0000/1/1 to the given date
    # Adding 366 for the year 0000 being a leap year
    days_to_date = (date_obj - datetime(1, 1, 1)).days + 366
 
    # Calculate hours in fraction of a day
    hours_fraction = (
        date_obj.hour / 24 + date_obj.minute / 1440 + date_obj.second / 86400
    )  # 86400 = 24*60*60 seconds in a day
 
    # Calculate MATLAB datenum
    matlab_datenum = days_to_date + hours_fraction + 1
 
    return matlab_datenum
 

convert_epw2csv

Main function that orchestrates the reading, processing, and saving of weather data.

def convert_epw2csv(
    roof_type,
    area,
    epw_path,
    city_name,
    time_step,
    new_epw_folder,
    window_schedule_path,
):
    # Create the new directory if it doesn't exist
    os.makedirs(new_epw_folder, exist_ok=True)
 
    # Extract the city name from the epw file path
    city_name = epw_path.split("/")[-1].split(".")[0].split("_")[1]
 
    print(
        f"Converting {epw_path} to new_epw_folder {new_epw_folder}... in {time_step} minutes interval"
    )
 
    # Read and preprocess the weather data from the epw file
    epw_data = read_epw_data(epw_path)
    epw_data = preprocess_epw_data(epw_data)
 
    # Read the epw file again to get the original data
    df = pd.read_csv(epw_path, delimiter="\t", header=None)
 
    # Compute additional weather parameters
    vaporDens, co2, skyT, soilT, elevation = compute_additional_data(
        epw_data, df)
 
    # Combine all weather parameters into one DataFrame
    data = combine_all_data(epw_data, vaporDens, co2, skyT, soilT, elevation)
 
    # Replace DateTime column in data with MATLAB datenum using datestr_to_matlab_datenum function
    data["DateTime"] = data["DateTime"].apply(datestr_to_matlab_datenum)
 
    # Save the processed data to a new csv file
    data.to_csv(
        f"{new_epw_folder}/EPW_{city_name}_{roof_type}_60_{area}.csv", index=False
    )
 
    # Generate high-resolution weather data with a time interval of 5 minutes
    hires_data = interpolate_to_hires(
        epw_data["DateTime"].iloc[0], data, len(epw_data["DateTime"]), epw_data
    )
 
    # Replace DateTime column in hires_data with MATLAB datenum using datestr_to_matlab_datenum function
    hires_data["DateTime"] = hires_data["DateTime"].apply(
        datestr_to_matlab_datenum)
 
    # Read the window schedule data
    window_schedule = pd.read_csv(
        f"./data/energyPlus/eplusout_{city_name}_{roof_type}_{time_step}_{area}.csv",
    )
 
    # Use the actual window schedule
    hires_data["Window_Schedule"] = window_schedule["window_opening_actual"]
 
    # Save the high-resolution weather data to a new csv file
    hires_data.to_csv(
        f"{new_epw_folder}/EPW_{city_name}_{roof_type}_{time_step}_{area}.csv",
        index=False,
    )

main

if __name__ == "__main__":
    roof_type = "triangle"
    area = 400
    epw_files = glob.glob("./data/epw_data/*.epw")
    convert_epw2csv(
        roof_type,
        area,
        epw_path="data/epw_data/NLD_Amsterdam.062400_IWEC.epw",
        city_name="Amsterdam",
        time_step=5,
        new_epw_folder="./data/weather_data",
        window_schedule_path="./data/greenhouse_geometry/window_schedule/window_schedule_triangle_5.csv",
    )
 

Complete Code

import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from scipy.optimize import minimize
from scipy.interpolate import pchip_interpolate
import glob
 
# Constants
SIGMA = 5.6697e-8
KELVIN = 273.15
SECS_IN_MONTH = 86400 * 365 / 12
 
def correct_hour_24(time_str):
    if time_str.endswith("24:00"):
        date_part, _ = time_str.split(" ")
        corrected_date = pd.to_datetime(date_part) + pd.DateOffset(days=1)
        return corrected_date.strftime("%Y-%m-%d 00:00")
    else:
        return time_str
 
def day_light_sum(time, rad):
    interval = (time[1] - time[0]) * 86400
    mn_before = 0
    mn_after = np.where(np.diff(np.floor(time)) == 1)[0][0] + 1
    light_sum = np.zeros(len(time))
 
    for k in range(len(time)):
        light_sum[k] = np.sum(rad[mn_before:mn_after])
 
        if k == mn_after - 1:
            mn_before = mn_after
            new_day_indices = np.where(np.diff(np.floor(time[mn_before + 1:])) == 1)[0]
 
            if len(new_day_indices) > 0:
                mn_after = new_day_indices[0] + mn_before + 2
            else:
                mn_after = len(time)
 
    light_sum = light_sum * interval * 1e-6
    return light_sum
 
def rh2vaporDens(temp, rh):
    R = 8.3144598
    C2K = 273.15
    Mw = 18.01528e-3
    p = [610.78, 238.3, 17.2694, -6140.4, 273, 28.916]
    satP = p[0] * np.exp(p[2] * temp / (temp + p[1]))
    pascals = (rh / 100) * satP
    vaporDens = pascals * Mw / (R * (temp + C2K))
    return vaporDens
 
def vaporDens2pres(temp, vaporDens):
    p = [610.78, 238.3, 17.2694, -6140.4, 273, 28.916]
    rh = vaporDens / rh2vaporDens(temp, 100)
    satP = p[0] * np.exp(p[2] * temp / (temp + p[1]))
    vaporPres = satP * rh
    return vaporPres
 
def co2ppm2dens(temp, ppm):
    R = 8.3144598
    C2K = 273.15
    M_CO2 = 44.01e-3
    P = 101325
    co2Dens = P * 10**-6 * ppm * M_CO2 / (R * (temp + C2K))
    return co2Dens
 
def read_epw_data(epw_path):
    column_names = [
        "Year",
        "Month",
        "Day",
        "Hour",
        "Minute",
        "Data Source and Uncertainty Flags",
        "Dry Bulb Temperature (°C)",
        "Dew Point Temperature (°C)",
        "Relative Humidity (%)",
        "Atmospheric Station Pressure (Pa)",
        "Extraterrestrial Horizontal Radiation (J/m²)",
        "Extraterrestrial Direct Normal Radiation (J/m²)",
        "Horizontal Infrared Radiation Intensity (J/m²)",
        "Global Horizontal Radiation (J/m²)",
        "Direct Normal Radiation (J/m²)",
        "Diffuse Horizontal Radiation (J/m²)",
        "Global Horizontal Illuminance (lux)",
        "Direct Normal Illuminance (lux)",
        "Diffuse Horizontal Illuminance (lux)",
        "Zenith Luminance (klux)",
        "Wind Direction (deg)",
        "Wind Speed (m/s)",
        "Total Sky Cover (tenths)",
        "Opaque Sky Cover (tenths)",
        "Visibility (km)",
        "Ceiling Height (m)",
        "Present Weather Observation (code)",
        "Present Weather Codes (code)",
        "Precipitable Water (mm)",
        "Aerosol Optical Depth (km)",
        "Snow Depth (cm)",
        "Days Since Last Snowfall (day)",
        "Albedo",
        "Liquid Precipitation Depth (mm)",
        "Liquid Precipitation Quantity (hr)",
    ]
    epw_data = pd.read_csv(epw_path, skiprows=8, header=None)
 
    assert len(column_names) == epw_data.shape[1], "Column names length doesn't match data columns number."
    epw_data.columns = column_names
    return epw_data
 
def preprocess_epw_data(epw_data):
    epw_data["Minute"] = epw_data["Minute"].astype(str).replace("60", "0")
    epw_data["DateTime"] = (
        epw_data["Year"].astype(str)
        + "-"
        + epw_data["Month"].astype(str).str.zfill(2)
        + "-"
        + epw_data["Day"].astype(str).str.zfill(2)
        + " "
        + epw_data["Hour"].astype(str).str.zfill(2)
        + ":"
        + epw_data["Minute"].astype(str).str.zfill(2)
    )
    epw_data = epw_data.drop(["Year", "Month", "Day", "Hour", "Minute"], axis=1)
    columns = ["DateTime"] + [col for col in epw_data.columns if col != "DateTime"]
    epw_data = epw_data.reindex(columns=columns)
    epw_data["DateTime"] = epw_data["DateTime"].apply(correct_hour_24)
    epw_data["DateTime"] = pd.to_datetime(epw_data["DateTime"])
    return epw_data
 
def compute_additional_data(epw_data, df):
    def soilTsec(sec):
        return (
            s[0] * np.sin(2 * np.pi * sec / SECS_IN_MONTH / per + 2 * np.pi / s[2])
            + s[3]
        )
 
    def cost_function(b):
        return np.sum(
            (b[0] * np.sin(2 * np.pi * x / per + 2 * np.pi / b[2]) + b[3] - y) ** 2
        )
 
    elevation = df.iloc[0, 0].split(",")[-1]
 
    rad = epw_data["Global Horizontal Radiation (J/m²)"]
    temp = epw_data["Dry Bulb Temperature (°C)"]
    rh = epw_data["Relative Humidity (%)"]
    radSky = epw_data["Horizontal Infrared Radiation Intensity (J/m²)"]
 
    vaporDens = rh2vaporDens(temp, rh)
    vapor_pressure = vaporDens2pres(temp, vaporDens)
    co2 = co2ppm2dens(temp, 410)
    skyT = (radSky / SIGMA) ** 0.25 - KELVIN
 
    soilT2 = df.iloc[3, 0].split(",")[1:]
    soilT2 = soilT2[21:33]
    y = np.array(soilT2, dtype=float)
    x = np.linspace(0.5, 11.5, len(y))
 
    yz = y - np.max(y) + (np.max(y) - np.min(y)) / 2
    shifted_yz = np.roll(yz, 1)
    crossings = np.where(yz * shifted_yz <= 0)[0]
    zx = x[crossings]
    per = 2 * np.mean(np.diff(zx))
 
    initial_guess = [np.max(y) - np.min(y), per, -1, np.mean(y)]
    result = minimize(cost_function, initial_guess, method="Nelder-Mead")
    s = result.x
    s[1] = per
 
    startTime = epw_data["DateTime"].iloc[0]
    secsInYear = (startTime - datetime(startTime.year, 1, 1, 0, 0, 0)).total_seconds()
    time_diff_seconds = np.arange(0, len(epw_data["DateTime"]) * 3600, 3600)
    soilT = soilTsec(secsInYear + time_diff_seconds)
 
    return vaporDens, co2, skyT, soilT, elevation
 
def combine_all_data(epw_data, vaporDens, co2, skyT, soilT, elevation):
    interval = 3600
    daily_rad_splits = np.array_split(
        epw_data["Global Horizontal Radiation (J/m²)"], len(epw_data) / 24
    )
    daily_rad_sum = [np.sum(day * interval * 1e-6) for day in daily_rad_splits]
    expanded_daily_rad_sum = np.repeat(daily_rad_sum, 24)
    expanded_daily_rad_sum_shifted = np.roll(expanded_daily_rad_sum, -1)
    expanded_daily_rad_sum_shifted[-1] = 0
 
    vapor_pressure = vaporDens2pres(epw_data["Dry Bulb Temperature (°C)"], vaporDens)
 
    data = pd.DataFrame(
        {
            "DateTime": epw_data["DateTime"],
            "Outdoor global irradiation(W m⁻²)": epw_data[
                "Global Horizontal Radiation (J/m²)"
            ],
            "Outdoor air temperature(°C)": epw_data["Dry Bulb Temperature (°C)"],
            "Outdoor vapor concentration(kg m⁻³)": vaporDens,
            "Outdoor CO2 concentration(kg{CO2} m⁻³{air})": co2,
            "Outdoor wind speed(m s⁻¹)": epw_data["Wind Speed (m/s)"],
            "Sky temperature(°C)": skyT,
            "temperature of external soil layer (°C)": soilT,
            "daily light sum (MJ m^{-2} day^{-1})": expanded_daily_rad_sum_shifted,
            "elevation(m)": elevation,
            "Outdoor vapor pressure(Pa)": vapor_pressure,
        }
    )
    return data
 
def interpolate_to_hires(startTime, data, length, epw_data, interval=300):
    length = len(epw_data["DateTime"])
    time_diff_seconds = np.arange(0, length * 3600, 3600)
    new_time_diff_seconds = np.arange(0, length * 3600, 300)
    new_datetimes = [
        startTime + pd.Timedelta(seconds=int(s)) for s in new_time_diff_seconds
    ]
    hires_data = pd.DataFrame({"DateTime": new_datetimes})
 
    for column in data.columns[1:]:
        hires_data[column] = pchip_interpolate(
            time_diff_seconds, data[column].values, new_time_diff_seconds
        )
 
    return hires_data
 
def datestr_to_matlab_datenum(date_obj) -> float:
    if isinstance(date_obj, pd.Timestamp):
        date_obj = date_obj.to_pydatetime()
 
    days_to_date = (date_obj - datetime(1, 1, 1)).days + 366
    hours_fraction = (
        date_obj.hour / 24 + date_obj.minute / 1440 + date_obj.second / 86400
    )
    matlab_datenum = days_to_date + hours_fraction + 1
    return matlab_datenum
 
def convert_epw2csv(
    roof_type,
    area,
    epw_path,
    city_name,
    time_step,
    new_epw_folder,
    window_schedule_path,
):
    os.makedirs(new_epw_folder, exist_ok=True)
    city_name = epw_path.split("/")[-1].split(".")[0].split("_")[1]
 
    print(
        f"Converting {epw_path} to new_epw_folder {new_epw_folder}... in {time_step} minutes interval"
    )
    epw_data = read_epw_data(epw_path)
    epw_data = preprocess_epw_data(epw_data)
    df = pd.read_csv(epw_path, delimiter="\t", header=None)
 
    vaporDens, co2, skyT, soilT, elevation = compute_additional_data(epw_data, df)
 
    data = combine_all_data(epw_data, vaporDens, co2, skyT, soilT, elevation)
    data["DateTime"] = data["DateTime"].apply(datestr_to_matlab_datenum)
 
    data.to_csv(
        f"{new_epw_folder}/EPW_{city_name}_{roof_type}_60_{area}.csv", index=False
    )
 
    hires_data = interpolate_to_hires(
        epw_data["DateTime"].iloc[0], data, len(epw_data["DateTime"]), epw_data
    )
 
    hires_data["DateTime"] = hires_data["DateTime"].apply(datestr_to_matlab_datenum)
 
    window_schedule = pd.read_csv(
        f"./data/energyPlus/eplusout_{city_name}_{roof_type}_{time_step}_{area}.csv",
    )
 
    hires_data["Window_Schedule"] = window_schedule["window_opening_actual"]
 
    hires_data.to_csv(
        f"{new_epw_folder}/EPW_{city_name}_{roof_type}_{time_step}_{area}.csv",
        index=False,
    )
 
if __name__ == "__main__":
    roof_type = "triangle"
    area = 400
    epw_files = glob.glob("./data/epw_data/*.epw")
 
    for epw_path in epw_files[:1]:
        convert_epw2csv(
            roof_type,
            area,
            epw_path="data/epw_data/NLD_Amsterdam.062400_IWEC.epw",
            city_name="Amsterdam",
            time_step=5,
            new_epw_folder="./data/weather_data",
            window_schedule_path="./data/greenhouse_geometry/window_schedule/window_schedule_triangle_5.csv",
        )