import numpy as np
import pandas as pd
import pydapper
import datetime
import typing
import warnings

from BreCal.schemas.model import Shipcall, Ship, Participant, Berth, User, Times, ShipcallParticipantMap
from BreCal.database.enums import ParticipantType
from BreCal.local_db import getPoolConnection
from BreCal.database.sql_queries import SQLQuery
from BreCal.schemas import model


def pandas_series_to_data_model():
    # TODO: placeholder; not yet implemented
    return


def set_participant_type(x, participant_df) -> int:
    """
    When iterating over each row entry x of the shipcall_participant_map,
    the 'type' column can be updated by extracting the matching data
    from a participant dataframe.

    returns: participant_type
    """
    participant_id = x["participant_id"]
    participant_type = participant_df.loc[participant_id, "type"]
    return participant_type
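
# A minimal usage sketch, mirroring the (deprecated) add_participant_type_to_map
# further below; 'spm' and 'participant_df' are assumed dataframes:
#   spm["type"] = spm.apply(lambda x: set_participant_type(x, participant_df=participant_df), axis=1)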


def get_synchronous_shipcall_times_standalone(query_time: pd.Timestamp, all_df_times: pd.DataFrame, delta_threshold=900) -> int:
    """
    This function counts all entries in {all_df_times} whose timestamp is close to {query_time}.
    It does so by:
    1.) selecting all eta_berth & etd_berth entries
    2.) measuring the timedelta towards {query_time}
    3.) converting the timedelta to total absolute seconds (positive or negative time differences do not matter)
    4.) applying a {delta_threshold} (in seconds) to identify whether two times are too close together
    5.) counting the times where the timedelta is below the threshold

    returns: counts
    """
    assert (isinstance(query_time, pd.Timestamp)) or (pd.isnull(query_time)), f"expected a timestamp. Found type: {type(query_time)} with value: {query_time}"
    if pd.isnull(query_time):
        return 0

    # get a timedelta for each valid (not null) time entry
    time_deltas_eta = [(query_time.to_pydatetime() - time_.to_pydatetime()) for time_ in all_df_times.loc[:, "eta_berth"] if not pd.isnull(time_)]
    time_deltas_etd = [(query_time.to_pydatetime() - time_.to_pydatetime()) for time_ in all_df_times.loc[:, "etd_berth"] if not pd.isnull(time_)]

    # consider both eta and etd times
    time_deltas = time_deltas_eta + time_deltas_etd

    # convert the timedeltas to absolute total seconds
    time_deltas = [abs(delta.total_seconds()) for delta in time_deltas]

    # keep only those time deltas which are <= the determined threshold (a list of booleans)
    time_deltas_filtered = [delta <= delta_threshold for delta in time_deltas]

    # booleans can be added/counted in Python by using sum()
    counts = sum(time_deltas_filtered)  # int
    return counts
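
# A hedged example with synthetic data (column names follow the function's expectations,
# the timestamps are illustrative):
#   _df = pd.DataFrame({
#       "eta_berth": [pd.Timestamp("2023-01-01 10:00"), pd.NaT],
#       "etd_berth": [pd.Timestamp("2023-01-01 10:10"), pd.Timestamp("2023-01-02 08:00")],
#   })
#   get_synchronous_shipcall_times_standalone(pd.Timestamp("2023-01-01 10:05"), _df)
#   # -> 2: both 10:00 and 10:10 lie within 900 seconds of 10:05; the NaT entry is skipped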


def execute_sql_query_standalone(query, param={}, pooledConnection=None, model=None, command_type="query"):
    """
    Execute an arbitrary query with a set of parameters, return the output and convert it to a list.
    When the pooled connection is built inside this function, it is closed before returning.
    """
    rebuild_pooled_connection = pooledConnection is None

    if rebuild_pooled_connection:
        pooledConnection = getPoolConnection()

    commands = pydapper.using(pooledConnection)
    # participant_query = "SELECT participant_id, type FROM shipcall_participant_map WHERE shipcall_id=?shipcall_id?"

    try:
        if command_type == "query":
            # creates a generator
            if model is None:
                schemas = commands.query(query, model=dict, param=param, buffered=False)
            else:
                schemas = commands.query(query, model=model, param=param, buffered=False)

            # creates a list of results from the generator
            schemas = [schema for schema in schemas]

        elif command_type == "execute":
            schemas = commands.execute(query, param=param)

        elif command_type == "single":
            sentinel = object()

            # pulls a *single* row from the query. Typically, these queries require an ID within the param dictionary.
            # when providing a model, such as model.Shipcall, the dataset is immediately translated into a data model.
            schemas = commands.query_single_or_default(query, sentinel, param=param) if model is None else commands.query_single_or_default(query, sentinel, param=param, model=model)
            if schemas is sentinel:
                raise Exception("no such record")

        elif command_type == "single_or_none":
            sentinel = object()

            # like "single", but returns None instead of raising when there is no match.
            schemas = commands.query_single_or_default(query, sentinel, param=param) if model is None else commands.query_single_or_default(query, sentinel, param=param, model=model)
            schemas = None if schemas is sentinel else schemas

        elif command_type == "execute_scalar":
            schemas = commands.execute_scalar(query)

        else:
            raise ValueError(command_type)

    finally:  # if needed, ensure that the pooled connection is closed
        if rebuild_pooled_connection:
            pooledConnection.close()
    return schemas
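
# Usage sketch (the query uses pydapper's ?name? placeholder style seen elsewhere
# in this module; the id is illustrative):
#   shipcall = execute_sql_query_standalone(
#       "SELECT * FROM shipcall WHERE id=?id?",
#       param={"id": 1},
#       model=model.Shipcall,
#       command_type="single_or_none")  # -> model.Shipcall, or None when there is no match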


def get_assigned_participant_of_type(shipcall_id: int, participant_type: typing.Union[int, model.ParticipantType]) -> typing.Optional[model.Participant]:
    """Obtains the ShipcallParticipantMap of a given shipcall and finds the participant id of the desired type. Finally, returns the respective Participant."""
    spm_shipcall_data = execute_sql_query_standalone(
        query=SQLQuery.get_shipcall_participant_map_by_shipcall_id_and_type(),
        param={"id": shipcall_id, "type": participant_type},
        command_type="query")  # returns a list of matches

    if len(spm_shipcall_data) == 0:
        return None

    query = 'SELECT * FROM participant WHERE id=?participant_id?'
    assigned_participant = execute_sql_query_standalone(
        query=query,
        param={"participant_id": spm_shipcall_data[0]["participant_id"]},
        model=model.Participant,
        command_type="single_or_none"
    )  # returns a single Participant, or None
    return assigned_participant
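
# Sketch (the shipcall id is illustrative; ParticipantType.AGENCY is the only
# member referenced elsewhere in this module):
#   agency = get_assigned_participant_of_type(42, ParticipantType.AGENCY.value)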


class SQLHandler():
    """
    An object that reads SQL queries from the sql_connection and stores the results in pandas DataFrames.
    The object can read all available tables into memory at once when providing 'read_all=True'.

    # #TODO_initialization: shipcall_tug_map, user_role_map & role_securable_map might be mapped to the respective dataframes
    """
    def __init__(self, sql_connection, read_all=False):
        self.sql_connection = sql_connection
        self.all_schemas = self.get_all_schemas_from_mysql()
        self.build_str_to_model_dict()

        if read_all:
            self.read_all(self.all_schemas)
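
    # Construction sketch (connection parameters are assumptions, cf. the
    # mysql.connector usage shown in execute_sql_query below):
    #   conn = mysql.connector.connect(**mysql_connection_data)
    #   handler = SQLHandler(conn, read_all=True)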

    def execute_sql_query(self, sql_connection, query, param):
        """
        This method is best used in combination with a python context-manager, such as:
            with mysql.connector.connect(**mysql_connection_data) as sql_connection:
                schema = sql_handler.execute_sql_query(sql_connection, query, param)
        """
        schemas = execute_sql_query_standalone(query, param, pooledConnection=sql_connection)
        return schemas

    def get_all_schemas_from_mysql(self):
        with self.sql_connection.cursor(buffered=True) as cursor:
            cursor.execute("SHOW TABLES")
            schema = cursor.fetchall()
            all_schemas = [schem[0] for schem in schema]
        return all_schemas

    def build_str_to_model_dict(self):
        """
        Creates a simple dictionary which maps a string to a data model class,
        e.g., 'ship' -> BreCal.schemas.model.Ship
        """
        self.str_to_model_dict = {
            "shipcall": Shipcall, "ship": Ship, "participant": Participant, "berth": Berth, "user": User, "times": Times,
            "shipcall_participant_map": ShipcallParticipantMap
        }
        return

    def read_mysql_table_to_df(self, table_name: str):
        """Determine a {table_name}, which will be read from a mysql server. Returns a pandas DataFrame with the respective data."""
        with self.sql_connection.cursor(buffered=True) as cursor:  # df = pd.read_sql(sql=f"SELECT * FROM {table_name}", con=self.sql_connection)
            # 1.) get the column names
            cursor.execute(f"DESCRIBE {table_name}")
            cols = cursor.fetchall()
            column_names = [col_name[0] for col_name in cols]

            # 2.) get the data tuples
            cursor.execute(f"SELECT * FROM {table_name}")
            data = cursor.fetchall()

            # 3.) map the data tuples to the correct column names
            data = [{k: v for k, v in zip(column_names, dat)} for dat in data]

            # 4.) build a dataframe from the respective data models (which ensures the correct data types)
            df = self.build_df_from_data_and_name(data, table_name)
        return df

    def build_df_from_data_and_name(self, data, table_name):
        data_model = self.str_to_model_dict.get(table_name)
        if data_model is not None:
            df = pd.DataFrame([data_model(**dat) for dat in data], columns=list(data_model.__annotations__.keys()))
        else:
            df = pd.DataFrame([dat for dat in data])
        return df

    def mysql_to_df(self, query, table_name):
        """Provide an arbitrary sql query that should be read from the mysql server. Returns a pandas DataFrame with the obtained data."""
        with self.sql_connection.cursor(buffered=True) as cursor:  # df = pd.read_sql(query, self.sql_connection).convert_dtypes()
            # 1.) get the column names
            cursor.execute(f"DESCRIBE {table_name}")
            cols = cursor.fetchall()
            column_names = [col_name[0] for col_name in cols]

            # 2.) get the data tuples
            cursor.execute(query)
            data = cursor.fetchall()

            # 3.) map the data tuples to the correct column names
            data = [{k: v for k, v in zip(column_names, dat)} for dat in data]

            # 4.) build a dataframe from the respective data models (which ensures the correct data types)
            df = self.build_df_from_data_and_name(data, table_name)
            if 'id' in df.columns:
                df = df.set_index('id', inplace=False)  # avoid inplace updates, so the raw sql remains unchanged
        return df
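
    # Usage sketch (the table name must exist in the connected schema):
    #   df_shipcalls = handler.mysql_to_df("SELECT * FROM shipcall", table_name="shipcall")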

    def read_all(self, all_schemas):
        # create a dictionary, which maps every mysql schema to a pandas DataFrame
        self.df_dict = self.build_full_mysql_df_dict(all_schemas)

        # update the 'participants' column in 'shipcall'
        self.initialize_shipcall_participant_list()

        # updating the 'type' in shipcall_participant_map is fully deprecated:
        # self.add_participant_type_to_map()
        return

    def build_full_mysql_df_dict(self, all_schemas):
        """Given a list of strings {all_schemas}, every schema is read as an individual pandas DataFrame into a dictionary with the respective keys. returns: dictionary {schema_name: pd.DataFrame}"""
        mysql_df_dict = {}
        for schem in all_schemas:
            query = f"SELECT * FROM {schem}"
            mysql_df_dict[schem] = self.mysql_to_df(query, table_name=schem)
        return mysql_df_dict

    def initialize_shipcall_participant_list(self):
        """
        Iteratively applies the .get_participants method to each shipcall.
        The function updates the 'participants' column.
        """
        # 1.) get all shipcalls
        df = self.df_dict.get('shipcall')

        # 2.) iterate over each individual shipcall, obtain the id (pandas calls it 'name')
        #     and apply the 'get_participants' method, which returns a list:
        #     if the shipcall_id exists, the list contains ids; otherwise, it is blank
        df['participants'] = df.apply(
            lambda x: self.get_participants(x.name),
            axis=1)
        return

    def add_participant_type_to_map(self):
        """
        Applies a lambda function, where the 'type'-column in the shipcall_participant_map is updated by reading the
        respective data from the participants. Updates the shipcall_participant_map inplace.
        """
        raise Exception("deprecated! Overwriting the shipcall_participant_map may cause harm, as a participant with multi-flag might be wrongfully assigned to multiple roles simultaneously.")
        #spm = self.df_dict["shipcall_participant_map"]
        #participant_df = self.df_dict["participant"]

        #spm.loc[:,"type"] = spm.loc[:].apply(lambda x: set_participant_type(x, participant_df=participant_df),axis=1)
        #self.df_dict["shipcall_participant_map"] = spm

    def get_assigned_participants(self, shipcall) -> pd.DataFrame:
        """Return each participant of a respective shipcall, filtered by the shipcall id."""
        # get the shipcall_participant_map
        spm = self.df_dict["shipcall_participant_map"]
        assigned_participants = spm.loc[spm["shipcall_id"] == shipcall.id]
        return assigned_participants

    def get_assigned_participants_by_type(self, assigned_participants: pd.DataFrame, participant_type: ParticipantType):
        """Filters a dataframe of assigned_participants by the provided type enumerator."""
        if isinstance(participant_type, int):
            participant_type = ParticipantType(participant_type)

        assigned_participants_of_type = assigned_participants.loc[[participant_type in ParticipantType(int(pt_)) for pt_ in list(assigned_participants["type"].values)]]
        #assigned_participants_of_type = assigned_participants.loc[assigned_participants["type"]==participant_type.value]
        return assigned_participants_of_type

    def check_if_any_participant_of_type_is_unassigned(self, shipcall, *args: ParticipantType) -> bool:
        """
        Given a list of input arguments, where each item is a participant type, the function determines whether at least
        one participant is assigned for that type. Returns a boolean: whether any of the required participant types is unassigned.

        This method is extensively used for validation rule 0001, where the header is checked beforehand to identify whether
        the respective participant type is assigned already.
        """
        assigned_participants = self.get_assigned_participants(shipcall)

        unassigned = []  # becomes a list of booleans
        for participant_type in args:
            assignments_of_type = self.get_assigned_participants_by_type(assigned_participants, participant_type=participant_type)
            unassignment = len(assignments_of_type) == 0  # a participant type is unassigned when there is no match
            unassigned.append(unassignment)
        return any(unassigned)  # returns a single boolean: whether ANY of the types is not assigned
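
    # Sketch (the shipcall object is assumed; ParticipantType.AGENCY is the only
    # member referenced elsewhere in this module):
    #   missing = handler.check_if_any_participant_of_type_is_unassigned(shipcall, ParticipantType.AGENCY)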

    def standardize_model_str(self, model_str: str) -> str:
        """Check if the 'model_str' is valid and apply lowercasing to the string."""
        model_str = model_str.lower()
        assert model_str in list(self.df_dict.keys()), f"cannot find the requested 'model_str' in mysql: {model_str}"
        return model_str

    def get_data(self, id: int, model_str: str):
        """
        Obtains {id} from the respective mysql database and builds a data model from that.
        The id should match the 'id'-column in the mysql schema.
        returns: data model, such as Ship, Shipcall, etc.

        e.g.,
            data = self.get_data(0, "shipcall")
        returns a Shipcall object
        """
        model_str = self.standardize_model_str(model_str)

        df = self.df_dict.get(model_str)
        data = self.df_loc_to_data_model(df, id, model_str)
        return data

    def get_all(self, model_str: str) -> list:
        """
        Given a model string (e.g., 'shipcall'), return a list of all
        data models of that type from the sql.
        """
        model_str = self.standardize_model_str(model_str)
        all_ids = self.df_dict.get(model_str).index

        all_data = [
            self.get_data(_aid, model_str)
            for _aid in all_ids
        ]
        return all_data

    def df_loc_to_data_model(self, df, id, model_str, loc_type: str = "loc"):
        if len(df) == 0:
            warnings.warn(f"empty dataframe in SQLHandler.df_loc_to_data_model for model type: {model_str}\n")
            return df

        # get a pandas series from the dataframe
        series = df.loc[id] if loc_type == "loc" else df.iloc[id]

        # get the respective data model object
        data_model = self.str_to_model_dict.get(model_str, None)
        assert data_model is not None, f"could not find the requested model_str: {model_str}"

        # build 'data' and fill the data model object
        # convert the 'id' to an integer, so the np.uint64 (used by pandas) is convertible to mysql
        data = {**{'id': int(id)}, **series.to_dict()}  # 'id' must be added manually, as .to_dict does not contain the index, which was set with .set_index
        data = data_model(**data)
        return data

    def filter_df_by_participant_type(self, df, participant_type: typing.Union[int, ParticipantType]) -> pd.DataFrame:
        """
        As ParticipantTypes are Flag objects, a dataframe's integer might represent multiple participant types simultaneously.
        This function allows for more complex filters, as the IntFlag supports membership queries.

        e.g.:
            ParticipantType(6) combines 2 and 4 (2 + 4 == 6)

            ParticipantType(2) in ParticipantType(6)  # True: 6 is both 2 and 4
            ParticipantType(1) in ParticipantType(6)  # False: 6 is both 2 and 4, but not 1
        """
        if isinstance(participant_type, int):
            participant_type = ParticipantType(participant_type)
        filtered_df = df.loc[[participant_type in ParticipantType(df_pt) for df_pt in list(df["participant_type"].values)]]
        return filtered_df
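
    # Sketch of the flag-based filter (the values are illustrative):
    #   _df = pd.DataFrame({"participant_type": [2, 4, 6, 1]})
    #   handler.filter_df_by_participant_type(_df, 2)  # keeps the rows with 2 and 6, since 2 is contained in 6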

    def get_times_for_participant_type(self, df_times, participant_type: int):
        filtered_series = self.filter_df_by_participant_type(df_times, participant_type)
        #filtered_series = df_times.loc[df_times["participant_type"]==participant_type]

        if len(filtered_series) == 0:
            return None

        if len(filtered_series) > 1:
            # correcting the error: ERROR:root:found multiple results
            # however, a warning will still be issued
            warnings.warn(f"found multiple results in function SQLHandler.get_times_for_participant_type\nConsidering only the first match!\nAffected Times Indexes: {filtered_series.index}")

        times = self.df_loc_to_data_model(filtered_series, id=0, model_str='times', loc_type="iloc")  # use iloc to retrieve the first result
        return times

    def dataframe_to_data_model_list(self, df, model_str) -> list:
        model_str = self.standardize_model_str(model_str)

        all_ids = df.index
        all_data = [
            self.df_loc_to_data_model(df, _aid, model_str)
            for _aid in all_ids
        ]
        return all_data

    def get_participants(self, shipcall_id: int) -> list:
        """
        Given a {shipcall_id}, obtain the respective list of participants.
        When there are no participants, return a blank list.

        returns: participant_id_list, where every element is an int
        """
        df = self.df_dict.get("shipcall_participant_map")
        if 'shipcall_id' in list(df.columns):
            df = df.set_index('shipcall_id', inplace=False)

        # the 'if' check ensures that no Exception is raised when the shipcall_id is not present in the df
        participant_id_list = df.loc[shipcall_id, "participant_id"].tolist() if shipcall_id in list(df.index) else []
        if not isinstance(participant_id_list, list):
            participant_id_list = [participant_id_list]
        return participant_id_list
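
    # Sketch (the shipcall id and the returned ids are illustrative):
    #   handler.get_participants(3)  # -> e.g. [5, 9], or [] for an unknown shipcall_id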

    def get_times_of_shipcall(self, shipcall) -> pd.DataFrame:
        df_times = self.df_dict.get('times')  # -> pd.DataFrame
        df_times = df_times.loc[df_times["shipcall_id"] == shipcall.id]
        return df_times

    def get_times_for_agency(self, non_null_column=None) -> pd.DataFrame:
        """
        options:
            non_null_column:
                None or str. If provided, the 'non_null_column'-column of the dataframe will be filtered,
                so only entries with provided values are returned (filters all NaN and NaT entries).
        """
        # get all times
        df_times = self.df_dict.get('times')  # -> pd.DataFrame

        # filter out all NaN and NaT entries
        if non_null_column is not None:
            # the Pandas documentation says for .isnull():
            # "This function takes a scalar or array-like object and indicates whether values are missing
            # (NaN in numeric arrays, None or NaN in object arrays, NaT in datetimelike)."
            df_times = df_times.loc[~df_times[non_null_column].isnull()]  # NOT null filter

        # filter by the agency participant_type
        times_agency = self.filter_df_by_participant_type(df_times, ParticipantType.AGENCY.value)
        #times_agency = df_times.loc[df_times["participant_type"]==ParticipantType.AGENCY.value]
        return times_agency

    def filter_df_by_key_value(self, df, key, value) -> pd.DataFrame:
        return df.loc[df[key] == value]

    def get_unique_ship_counts(self, all_df_times: pd.DataFrame, times_agency: pd.DataFrame, query: str, rounding: str = "min", maximum_threshold=3):
        """Given a dataframe of all agency times, count all entries that match the agency's time for the {query} column. returns: counts (int)"""
        # #deprecated!
        warnings.warn("SQLHandler.get_unique_ship_counts is deprecated. Instead, please use SQLHandler.count_synchronous_shipcall_times")

        # optional: rounding
        if rounding is not None:
            all_df_times.loc[:, query] = pd.to_datetime(all_df_times.loc[:, query]).dt.round(rounding)  # e.g., 'min' --- corrects the error: 'AttributeError: Can only use .dt accessor with datetimelike values'
            query_time_agency = pd.to_datetime(times_agency[query]).iloc[0].round(rounding)  # e.g., 'min'

        # after rounding, filter {all_df_times}, so only those which match the current query are of interest
        # takes 'times_agency' to sample which value should match
        all_df_times = all_df_times.loc[all_df_times[query] == query_time_agency]

        # finally, count all remaining entries
        values = all_df_times.loc[:, query]

        # get unique entries and counts
        counts = len(values)  # unique, counts = np.unique(values, return_counts=True)
        return counts  # (values, unique, counts)

    def count_synchronous_shipcall_times(self, query_time: pd.Timestamp, all_df_times: typing.Optional[pd.DataFrame] = None, delta_threshold=900) -> int:
        """Count all times entries which are too close to the query_time; {delta_threshold} (in seconds) determines the threshold. returns: counts (int)"""
        if all_df_times is None:
            all_df_times = self.df_dict.get("times")
        return get_synchronous_shipcall_times_standalone(query_time, all_df_times, delta_threshold)
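
    # End-to-end sketch (the connection and timestamp are illustrative):
    #   handler = SQLHandler(conn, read_all=True)
    #   handler.count_synchronous_shipcall_times(pd.Timestamp("2023-01-01 10:05"))  # falls back to the cached 'times' dataframe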