from pathlib import Path
from typing import Optional, Tuple

from scipy.stats import circmean
import numpy as np
import pandas as pd


#### Private UTIL FUNCTIONS ####
def __csv_file_exists(file_path: Path) -> bool:
    """Report whether ``file_path`` points at an existing ``.csv`` file.

    The argument is validated before the file system is consulted for the
    file itself: it must be a ``Path``, carry a ``.csv`` suffix (compared
    case-insensitively), and live in a directory that exists.

    Args:
        file_path: The Path to the file being checked.

    Returns:
        True if something exists at ``file_path`` and False otherwise.

    Raises:
        TypeError: If file_path is not a Path object.
        ValueError: If the file name does not end with '.csv'.
        FileNotFoundError: If the parent directory of the file does not exist.
    """
    # Reject anything that is not a pathlib.Path up front.
    is_path = isinstance(file_path, Path)
    if not is_path:
        raise TypeError("file_path must be a pathlib.Path object.")

    # The extension check ignores case, so '.CSV' is accepted as well.
    extension = file_path.suffix.lower()
    if extension != ".csv":
        raise ValueError("The filename must end with '.csv'.")

    # A missing parent directory is reported as an error rather than False.
    parent_found = file_path.parent.exists()
    if not parent_found:
        raise FileNotFoundError(
            f"The directory '{file_path.parent}' does not exist."
        )

    # All checks passed; report whether the path itself exists.
    return file_path.exists()


# Data File Functions
def load_data_csv(
    file_path: Path,
    date_time_column: str,
    date_time_format: str = "%m/%d/%Y %I:%M %p",
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
) -> pd.DataFrame:
    """Load a csv file into a DataFrame, filtered by date and with date columns added.

    Column names are stripped of surrounding whitespace and of apostrophes.
    The column named by ``date_time_column`` is parsed into datetimes using
    ``date_time_format``. Rows are then filtered to the inclusive range
    ``[start_date, end_date]``; a bound that is not supplied is not applied.

    Three columns are added to the result:

    - ``DOY``: zero-based day of the year (January 1st = 0, December 31st =
      364, or 365 in a leap year).
    - ``YEAR``: the calendar year of the timestamp.
    - ``NORMALIZED_DATE``: the timestamp with its time set to midnight.

    Args:
        file_path: The path to your csv data file.
        date_time_column: Name of the column holding the timestamps, as it
            appears after the column names have been cleaned.
        date_time_format: The ``strftime``-style format used to parse the
            timestamp column.
        start_date: Optional. The date to start filtering from (inclusive).
        end_date: Optional. The date to stop filtering on (inclusive).

    Returns:
        A DataFrame with the information from the csv file filtered by the
        specified dates, with a fresh 0..n-1 index.

    Raises:
        TypeError: If file_path is not a Path object.
        ValueError: If the file extension is not '.csv', if start_date is
            later than end_date, or if date_time_column is not a column of
            the csv data.
        FileNotFoundError: If the file (or its parent directory) could not
            be found.
    """
    # Check Parameters
    if not __csv_file_exists(file_path):
        raise FileNotFoundError(
            f"The file, {file_path}, that you are trying to load does not exist."
        )

    # Parse each bound once; empty/None bounds mean "unbounded" on that side.
    start = pd.to_datetime(start_date) if start_date else None
    end = pd.to_datetime(end_date) if end_date else None
    if start is not None and end is not None and start > end:
        raise ValueError(
            "The end date must be after the start date.\n"
            f"Start Date:\t{start_date}\nEnd Date:\t{end_date}"
        )

    # Read the data from the given csv file and make the formatting consistent
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip().str.replace("'", "")
    if date_time_column not in df.columns:
        raise ValueError(
            f"The given date column name, {date_time_column}, was not found in the csv data!"
        )
    df[date_time_column] = pd.to_datetime(
        df[date_time_column], format=date_time_format
    )

    # Filter data using the start and end dates
    if start is not None:
        df = df[df[date_time_column] >= start]
    if end is not None:
        df = df[df[date_time_column] <= end]

    # Re-index before adding columns: reset_index returns a new frame, so the
    # assignments below never write into a filtered view of the original.
    df = df.reset_index(drop=True)

    # Adds a Year, DOY, and Normalized Date column to the df
    timestamps = df[date_time_column]
    df["DOY"] = timestamps.dt.dayofyear - 1
    df["YEAR"] = timestamps.dt.year
    df["NORMALIZED_DATE"] = timestamps.dt.normalize()
    return df
def interpolate_df(
    df: pd.DataFrame,
    col_names: Optional[list[str]] = None,
    method: str = "linear",
) -> pd.DataFrame:
    """Interpolate missing data within a DataFrame, column by column.

    Each selected column is replaced by the result of its own
    ``Series.interpolate(method=method)``. The DataFrame is modified in place
    and also returned.

    Args:
        df: The DataFrame to interpolate.
        col_names: An optional list of column names to interpolate. When it
            is None or empty, every column of ``df`` is interpolated.
        method: The pandas interpolation method to use.

    Returns:
        The same DataFrame object, with the selected columns interpolated.

    Raises:
        KeyError: If one of the col_names was not found in the DataFrame. The
            check happens before any column is touched, so ``df`` is left
            unmodified when this is raised.
    """
    if col_names:
        # Validate every requested name first so a bad name cannot leave the
        # frame half-interpolated.
        missing = [col for col in col_names if col not in df.columns]
        if missing:
            raise KeyError(f"{missing[0]} was not found in the df")
        targets = list(col_names)
    else:
        targets = list(df.columns)

    for col in targets:
        df[col] = df[col].interpolate(method=method)
    return df
def save_data_csv(df: pd.DataFrame, file_path: Path) -> Path:
    """Save a DataFrame to the given csv file, without its index.

    If something already exists at ``file_path`` the user is asked on the
    console whether to overwrite it. Only an answer of ``y`` or ``yes``
    (ignoring case and surrounding whitespace) lets the save proceed.

    Args:
        df: The dataframe to be saved.
        file_path: A Path to where you would like to save the data.

    Returns:
        The Path to the newly saved file.

    Raises:
        TypeError: If file_path is not a Path object.
        ValueError: If the file extension is not '.csv'.
        FileNotFoundError: If the parent directory does not exist.
        FileExistsError: If the file already exists and the user does not
            confirm the overwrite.
    """
    # Check Parameters
    if __csv_file_exists(file_path):
        print(f"The file {file_path} already exists.")
        answer = input("Are you sure you want OVERWRITE the file(y/n): ")
        # Normalise the reply so "Y", " y " and "yes" are all treated as consent.
        if answer.strip().lower() not in ("y", "yes"):
            raise FileExistsError(f"The file, {file_path}, already exists.")

    # Save DataFrame
    df.to_csv(file_path, index=False)
    return file_path
# Fletcher's Functions
def match_datetimes(
    target_dt: np.ndarray, desired_dt: np.ndarray
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Match each datetime in ``desired_dt`` to the closest datetime in ``target_dt``.

    ``target_dt`` must be sorted in ascending order, because candidates are
    located with a binary search. For every desired value the neighbours on
    either side of its insertion point are compared and the nearer one is
    chosen; when both are equally close the later (right-hand) entry wins.

    Args:
        target_dt: The sorted np array of actual weather date_times.
        desired_dt: The np array of date_times to find the closest match for.

    Returns:
        A tuple of 4 numpy arrays:
        The first contains the original desired_dt.
        The second contains the date_times matched from the target_dt.
        The third contains the indices of the matched date_times in the
        original target_dt.
        The fourth contains the absolute difference in time between the
        desired and matched times.

    Raises:
        ValueError: If target_dt is empty while desired_dt is not.
    """
    target_dt = np.asarray(target_dt)
    desired_dt = np.asarray(desired_dt)
    if target_dt.size == 0 and desired_dt.size > 0:
        raise ValueError(
            "target_dt must contain at least one datetime to match against."
        )

    # Insertion point of each desired value, clamped to the last valid index
    # so values beyond the end of target_dt still get a candidate.
    idx = np.searchsorted(target_dt, desired_dt, side="left")
    idx = np.minimum(idx, len(target_dt) - 1)

    # Compare against the left-hand neighbour wherever one exists. The strict
    # "<" keeps the right-hand entry on ties.
    has_before = idx > 0
    before_idx = np.where(has_before, idx - 1, idx)
    before_is_closer = has_before & (
        np.abs(desired_dt - target_dt[before_idx])
        < np.abs(desired_dt - target_dt[idx])
    )
    idx = np.where(before_is_closer, idx - 1, idx)

    matched_times = target_dt[idx]
    diffs = np.abs(desired_dt - matched_times)
    return desired_dt, matched_times, idx, diffs
def df_as_dict(
    df: pd.DataFrame,
    cols: Optional[np.ndarray] = None,
    idx: Optional[np.ndarray] = None,
) -> dict[str, list[float]]:
    """Extract values from a DataFrame into a dictionary of lists.

    Column names are used as the dictionary keys and each value is the list
    of that column's entries at the requested row positions. Requested
    columns that do not exist in ``df`` are silently skipped.

    Args:
        df: The DataFrame containing weather data.
        cols: The column names to include. When None or empty, every column
            of ``df`` is included.
        idx: The positional row indices (as used by ``iloc``) to include, in
            the order given. When None or empty, every row is included.

    Returns:
        A dictionary of {column_name: values}.
    """
    # Get all Columns if None are specified
    if cols is None or len(cols) == 0:
        cols = df.columns

    # Get all indices if None are specified
    if idx is None or len(idx) == 0:
        idx = np.arange(len(df))

    return {col: df[col].iloc[idx].tolist() for col in cols if col in df.columns}