Source code for agroecometrics.data

from pathlib import Path
from typing import Optional, Tuple
from scipy.stats import circmean


import numpy as np
import pandas as pd

from agroecometrics import settings


# Gets the actual LABELS of columns based on the user settings.
# The functions in this module index this mapping by key (e.g. 'date_time', 'doy').
LABELS = settings.get_labels()

####    UTIL FUNCTIONS    ####
def csv_file_exists(file_path: Path):
    """
    Report whether a csv file is already present at the given Path.

    Args:
        file_path: The Path to the csv file to check.

    Returns:
        True if something already exists at file_path and False otherwise.
        Either answer implies that the parent directory exists.

    Raises:
        TypeError: If file_path is not a Path object.
        ValueError: If the file extension is not '.csv'.
        FileNotFoundError: If the parent directory does not exist.
    """
    # Guard clauses: reject bad input first, answer the question last.
    if not isinstance(file_path, Path):
        raise TypeError("file_path must be a pathlib.Path object.")

    # The extension comparison is case-insensitive ('.CSV' is accepted).
    extension = file_path.suffix.lower()
    if extension != ".csv":
        raise ValueError("The filename must end with '.csv'.")

    # A missing parent directory is an error rather than a plain "False".
    if not file_path.parent.exists():
        raise FileNotFoundError(f"The directory '{file_path.parent}' does not exist.")

    return file_path.exists()
# Data File Functions
def load_data(
    file_path: Path,
    date_format: str = LABELS['date_format'],
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
) -> pd.DataFrame:
    """
    Loads data into a DataFrame from a given CSV file, filtered by date and cleaned.

    Loads the data between the two specified dates (inclusive bounds). If no
    start or end date is specified, the oldest and newest dates in the data
    are used respectively.

    Column names are stripped of surrounding whitespace and of single quotes.
    Three columns are added to the result:
      * a normalized (midnight) copy of the date column,
      * the zero-based day of year (January 1st = 0; December 31st = 364,
        or 365 in a leap year),
      * the year as an integer.

    NOTE: the bounds are compared as timestamps. A date-only ``end_date`` is
    parsed as midnight of that day, so rows later on that same day are NOT
    included when the data is sub-daily.

    Args:
        file_path: The path to your csv data file
        date_format: The date format used to parse the date column of the file
        start_date: Optional. The date to start filtering from (Inclusive)
        end_date: Optional. The date to stop filtering on (Inclusive)

    Returns:
        A DataFrame with the information from the csv file filtered by the
        specified dates, with a fresh 0..n-1 index.

    Raises:
        TypeError: If file_path is not a Path object.
        ValueError: If the file extension is not '.csv' or if start_date is
            later than end_date.
        FileNotFoundError: If the file could not be found.
    """
    # Check Parameters
    if not csv_file_exists(file_path):
        raise FileNotFoundError(f"The file, {file_path}, that you are trying to load does not exist.")

    # Parse each bound once; an empty/None bound means "no limit on this side".
    start_ts = pd.to_datetime(start_date) if start_date else None
    end_ts = pd.to_datetime(end_date) if end_date else None
    if start_ts is not None and end_ts is not None and start_ts > end_ts:
        raise ValueError(
            f"The end date must be after the start date.\nStart Date:\t{start_date}\nEnd Date:\t{end_date}"
        )

    # Read the data from the given csv file and make the formatting consistent
    date_col = LABELS['date_time']
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip().str.replace("'", "")
    df[date_col] = pd.to_datetime(df[date_col], format=date_format)

    # Filter data using the start and end dates
    if start_ts is not None:
        df = df[df[date_col] >= start_ts]
    if end_ts is not None:
        df = df[df[date_col] <= end_ts]

    # Work on an independent copy so that adding columns below never writes
    # into a slice of the frame that was read from disk.
    df = df.copy()

    # Adds a Year, DOY, and Normalized Date column to the df
    timestamps = df[date_col]
    df[LABELS['doy']] = timestamps.dt.dayofyear - 1
    df[LABELS['year']] = timestamps.dt.year
    df[LABELS['date_norm']] = timestamps.dt.normalize()

    return df.reset_index(drop=True)
def interpolate_missing_data(
    df: pd.DataFrame,
    label_keys: Optional[list[str]] = None,
    method: str = "linear"
) -> None:
    """
    Interpolates missing data within a DataFrame, modifying it in place.

    When label_keys is given, each key is looked up in LABELS and the
    corresponding column is interpolated. When label_keys is None, every
    numeric column of df is interpolated; non-numeric columns are left
    untouched because they cannot be interpolated.

    Args:
        df: The DataFrame to interpolate data on
        label_keys: A list of the label keys to interpolate data on, or None
            to interpolate every numeric column.
        method: The pandas interpolation type to use on the data

    Raises:
        ValueError: If label_keys is an empty list.
        KeyError: If a key in label_keys is not found in LABELS. Raised
            before any column is modified.
    """
    if label_keys is None:
        # No keys requested: operate directly on the frame's own column names.
        columns = list(df.select_dtypes(include="number").columns)
    else:
        if len(label_keys) == 0:
            raise ValueError("label_keys must containa list of keys or be none")
        # Validate every key up front so a bad key cannot leave df half-updated.
        for key in label_keys:
            if key not in LABELS:
                raise KeyError(f"{key} was not found in the available LABELS")
        columns = [LABELS[key] for key in label_keys]

    # Assign the result back instead of calling interpolate(inplace=True) on
    # df[col]: that chained in-place call does not update df under pandas
    # Copy-on-Write.
    for column in columns:
        df[column] = df[column].interpolate(method=method)
def save_data(
    df: pd.DataFrame,
    file_path: Path
) -> Path:
    """
    Saves your DataFrame to the given csv file.

    If the file already exists, the user is asked on the console to confirm
    the overwrite; anything other than "y" (case-insensitive, surrounding
    whitespace ignored) aborts the save.

    Args:
        df: The dataframe to be saved
        file_path: A Path to where you would like to save the data

    Returns:
        The Path the data was written to (the same file_path that was given).

    Raises:
        TypeError: If file_path is not a Path object.
        ValueError: If the file extension is not '.csv'.
        FileNotFoundError: If the parent directory does not exist.
        FileExistsError: If the file already exists and the user selects not
            to override it
    """
    # Check Parameters
    if csv_file_exists(file_path):
        print(f"The file {file_path} already exists.")
        answer = input("Are you sure you want OVERWRITE the file(y/n): ")
        if answer.strip().lower() != "y":
            raise FileExistsError(f"The file, {file_path}, already exists.")

    # Save DataFrame (index omitted so the file round-trips through load_data)
    df.to_csv(file_path, index=False)
    return file_path