import numpy as np
import pandas as pd
import pydapper
import datetime
import typing
import warnings

from BreCal.schemas.model import Shipcall, Ship, Participant, Berth, User, Times, ShipcallParticipantMap
from BreCal.database.enums import ParticipantType
from BreCal.local_db import getPoolConnection
from BreCal.database.sql_queries import SQLQuery
from BreCal.schemas import model


def pandas_series_to_data_model():
    # TODO: placeholder; not yet implemented
    return


def set_participant_type(x, participant_df) -> int:
    """
    When iterating over each row entry x of the shipcall_participant_map,
    the 'type' column can be updated by extracting the matching data
    from a participant dataframe.

    returns: participant_type
    """
    participant_id = x["participant_id"]
    participant_type = participant_df.loc[participant_id, "type"]
    return participant_type
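
# A minimal usage sketch, mirroring the (deprecated) add_participant_type_to_map
# further below; 'spm' and 'participant_df' are assumed dataframes:
#   spm["type"] = spm.apply(lambda x: set_participant_type(x, participant_df=participant_df), axis=1)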


def get_synchronous_shipcall_times_standalone(query_time: pd.Timestamp, all_df_times: pd.DataFrame, delta_threshold=900) -> int:
    """
    This function counts all entries in {all_df_times} whose timestamp is close to {query_time}.
    It does so by:
    1.) selecting all eta_berth & etd_berth entries
    2.) measuring the timedelta towards {query_time}
    3.) converting the timedelta to total absolute seconds (positive or negative time differences do not matter)
    4.) applying a {delta_threshold} (in seconds) to identify whether two times are too close together
    5.) counting the times where the timedelta is below the threshold

    returns: counts
    """
    assert (isinstance(query_time, pd.Timestamp)) or (pd.isnull(query_time)), f"expected a timestamp. Found type: {type(query_time)} with value: {query_time}"
    if pd.isnull(query_time):
        return 0

    # get a timedelta for each valid (not null) time entry
    time_deltas_eta = [(query_time.to_pydatetime() - time_.to_pydatetime()) for time_ in all_df_times.loc[:, "eta_berth"] if not pd.isnull(time_)]
    time_deltas_etd = [(query_time.to_pydatetime() - time_.to_pydatetime()) for time_ in all_df_times.loc[:, "etd_berth"] if not pd.isnull(time_)]

    # consider both eta and etd times
    time_deltas = time_deltas_eta + time_deltas_etd

    # convert the timedeltas to absolute total seconds
    time_deltas = [abs(delta.total_seconds()) for delta in time_deltas]

    # keep only those time deltas which are <= the determined threshold (a list of booleans)
    time_deltas_filtered = [delta <= delta_threshold for delta in time_deltas]

    # booleans can be added/counted in Python by using sum()
    counts = sum(time_deltas_filtered)  # int
    return counts
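
# A hedged example with synthetic data (column names follow the function's expectations,
# the timestamps are illustrative):
#   _df = pd.DataFrame({
#       "eta_berth": [pd.Timestamp("2023-01-01 10:00"), pd.NaT],
#       "etd_berth": [pd.Timestamp("2023-01-01 10:10"), pd.Timestamp("2023-01-02 08:00")],
#   })
#   get_synchronous_shipcall_times_standalone(pd.Timestamp("2023-01-01 10:05"), _df)
#   # -> 2: both 10:00 and 10:10 lie within 900 seconds of 10:05; the NaT entry is skipped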


def execute_sql_query_standalone(query, param={}, pooledConnection=None, model=None, command_type="query"):
    """
    Execute an arbitrary query with a set of parameters, return the output and convert it to a list.
    When the pooled connection is built inside this function, it is closed before returning.
    """
    rebuild_pooled_connection = pooledConnection is None

    if rebuild_pooled_connection:
        pooledConnection = getPoolConnection()

    commands = pydapper.using(pooledConnection)
    # participant_query = "SELECT participant_id, type FROM shipcall_participant_map WHERE shipcall_id=?shipcall_id?"

    try:
        if command_type == "query":
            # creates a generator
            if model is None:
                schemas = commands.query(query, model=dict, param=param, buffered=False)
            else:
                schemas = commands.query(query, model=model, param=param, buffered=False)

            # creates a list of results from the generator
            schemas = [schema for schema in schemas]

        elif command_type == "execute":
            schemas = commands.execute(query, param=param)

        elif command_type == "single":
            sentinel = object()

            # pulls a *single* row from the query. Typically, these queries require an ID within the param dictionary.
            # when providing a model, such as model.Shipcall, the dataset is immediately translated into a data model.
            schemas = commands.query_single_or_default(query, sentinel, param=param) if model is None else commands.query_single_or_default(query, sentinel, param=param, model=model)
            if schemas is sentinel:
                raise Exception("no such record")

        elif command_type == "single_or_none":
            sentinel = object()

            # like "single", but returns None instead of raising when there is no match.
            schemas = commands.query_single_or_default(query, sentinel, param=param) if model is None else commands.query_single_or_default(query, sentinel, param=param, model=model)
            schemas = None if schemas is sentinel else schemas

        elif command_type == "execute_scalar":
            schemas = commands.execute_scalar(query)

        else:
            raise ValueError(command_type)

    finally:  # if needed, ensure that the pooled connection is closed
        if rebuild_pooled_connection:
            pooledConnection.close()
    return schemas
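
# Usage sketch (the query uses pydapper's ?name? placeholder style seen elsewhere
# in this module; the id is illustrative):
#   shipcall = execute_sql_query_standalone(
#       "SELECT * FROM shipcall WHERE id=?id?",
#       param={"id": 1},
#       model=model.Shipcall,
#       command_type="single_or_none")  # -> model.Shipcall, or None when there is no match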


def get_assigned_participant_of_type(shipcall_id: int, participant_type: typing.Union[int, model.ParticipantType]) -> typing.Optional[model.Participant]:
    """Obtains the ShipcallParticipantMap of a given shipcall and finds the participant id of the desired type. Finally, returns the respective Participant."""
    spm_shipcall_data = execute_sql_query_standalone(
        query=SQLQuery.get_shipcall_participant_map_by_shipcall_id_and_type(),
        param={"id": shipcall_id, "type": participant_type},
        command_type="query")  # returns a list of matches

    if len(spm_shipcall_data) == 0:
        return None

    query = 'SELECT * FROM participant WHERE id=?participant_id?'
    assigned_participant = execute_sql_query_standalone(
        query=query,
        param={"participant_id": spm_shipcall_data[0]["participant_id"]},
        model=model.Participant,
        command_type="single_or_none"
    )  # returns a single Participant, or None
    return assigned_participant
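
# Sketch (the shipcall id is illustrative; ParticipantType.AGENCY is the only
# member referenced elsewhere in this module):
#   agency = get_assigned_participant_of_type(42, ParticipantType.AGENCY.value)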


class SQLHandler():
    """
    An object that reads SQL queries from the sql_connection and stores the results in pandas DataFrames.
    The object can read all available tables into memory at once when providing 'read_all=True'.

    # #TODO_initialization: shipcall_tug_map, user_role_map & role_securable_map might be mapped to the respective dataframes
    """
    def __init__(self, sql_connection, read_all=False):
        self.sql_connection = sql_connection
        self.all_schemas = self.get_all_schemas_from_mysql()
        self.build_str_to_model_dict()

        if read_all:
            self.read_all(self.all_schemas)
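
    # Construction sketch (connection parameters are assumptions, cf. the
    # mysql.connector usage shown in execute_sql_query below):
    #   conn = mysql.connector.connect(**mysql_connection_data)
    #   handler = SQLHandler(conn, read_all=True)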

    def execute_sql_query(self, sql_connection, query, param):
        """
        This method is best used in combination with a python context-manager, such as:
            with mysql.connector.connect(**mysql_connection_data) as sql_connection:
                schema = sql_handler.execute_sql_query(sql_connection, query, param)
        """
        schemas = execute_sql_query_standalone(query, param, pooledConnection=sql_connection)
        return schemas

    def get_all_schemas_from_mysql(self):
        with self.sql_connection.cursor(buffered=True) as cursor:
            cursor.execute("SHOW TABLES")
            schema = cursor.fetchall()
            all_schemas = [schem[0] for schem in schema]
        return all_schemas

    def build_str_to_model_dict(self):
        """
        Creates a simple dictionary which maps a string to a data model class,
        e.g., 'ship' -> BreCal.schemas.model.Ship
        """
        self.str_to_model_dict = {
            "shipcall": Shipcall, "ship": Ship, "participant": Participant, "berth": Berth, "user": User, "times": Times,
            "shipcall_participant_map": ShipcallParticipantMap
        }
        return

    def read_mysql_table_to_df(self, table_name: str):
        """Determine a {table_name}, which will be read from a mysql server. Returns a pandas DataFrame with the respective data."""
        with self.sql_connection.cursor(buffered=True) as cursor:  # df = pd.read_sql(sql=f"SELECT * FROM {table_name}", con=self.sql_connection)
            # 1.) get the column names
            cursor.execute(f"DESCRIBE {table_name}")
            cols = cursor.fetchall()
            column_names = [col_name[0] for col_name in cols]

            # 2.) get the data tuples
            cursor.execute(f"SELECT * FROM {table_name}")
            data = cursor.fetchall()

            # 3.) map the data tuples to the correct column names
            data = [{k: v for k, v in zip(column_names, dat)} for dat in data]

            # 4.) build a dataframe from the respective data models (which ensures the correct data types)
            df = self.build_df_from_data_and_name(data, table_name)
        return df

    def build_df_from_data_and_name(self, data, table_name):
        data_model = self.str_to_model_dict.get(table_name)
        if data_model is not None:
            df = pd.DataFrame([data_model(**dat) for dat in data], columns=list(data_model.__annotations__.keys()))
        else:
            df = pd.DataFrame([dat for dat in data])
        return df

    def mysql_to_df(self, query, table_name):
        """Provide an arbitrary sql query that should be read from the mysql server. Returns a pandas DataFrame with the obtained data."""
        with self.sql_connection.cursor(buffered=True) as cursor:  # df = pd.read_sql(query, self.sql_connection).convert_dtypes()
            # 1.) get the column names
            cursor.execute(f"DESCRIBE {table_name}")
            cols = cursor.fetchall()
            column_names = [col_name[0] for col_name in cols]

            # 2.) get the data tuples
            cursor.execute(query)
            data = cursor.fetchall()

            # 3.) map the data tuples to the correct column names
            data = [{k: v for k, v in zip(column_names, dat)} for dat in data]

            # 4.) build a dataframe from the respective data models (which ensures the correct data types)
            df = self.build_df_from_data_and_name(data, table_name)
            if 'id' in df.columns:
                df = df.set_index('id', inplace=False)  # avoid inplace updates, so the raw sql remains unchanged
        return df
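
    # Usage sketch (the table name must exist in the connected schema):
    #   df_shipcalls = handler.mysql_to_df("SELECT * FROM shipcall", table_name="shipcall")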

    def read_all(self, all_schemas):
        # create a dictionary, which maps every mysql schema to a pandas DataFrame
        self.df_dict = self.build_full_mysql_df_dict(all_schemas)

        # update the 'participants' column in 'shipcall'
        self.initialize_shipcall_participant_list()

        # updating the 'type' in shipcall_participant_map is fully deprecated:
        # self.add_participant_type_to_map()
        return

    def build_full_mysql_df_dict(self, all_schemas):
        """Given a list of strings {all_schemas}, every schema is read as an individual pandas DataFrame into a dictionary with the respective keys. returns: dictionary {schema_name: pd.DataFrame}"""
        mysql_df_dict = {}
        for schem in all_schemas:
            query = f"SELECT * FROM {schem}"
            mysql_df_dict[schem] = self.mysql_to_df(query, table_name=schem)
        return mysql_df_dict

    def initialize_shipcall_participant_list(self):
        """
        Iteratively applies the .get_participants method to each shipcall.
        The function updates the 'participants' column.
        """
        # 1.) get all shipcalls
        df = self.df_dict.get('shipcall')

        # 2.) iterate over each individual shipcall, obtain the id (pandas calls it 'name')
        #     and apply the 'get_participants' method, which returns a list:
        #     if the shipcall_id exists, the list contains ids; otherwise, it is blank
        df['participants'] = df.apply(
            lambda x: self.get_participants(x.name),
            axis=1)
        return

    def add_participant_type_to_map(self):
        """
        Applies a lambda function, where the 'type'-column in the shipcall_participant_map is updated by reading the
        respective data from the participants. Updates the shipcall_participant_map inplace.
        """
        raise Exception("deprecated! Overwriting the shipcall_participant_map may cause harm, as a participant with multi-flag might be wrongfully assigned to multiple roles simultaneously.")
        #spm = self.df_dict["shipcall_participant_map"]
        #participant_df = self.df_dict["participant"]

        #spm.loc[:,"type"] = spm.loc[:].apply(lambda x: set_participant_type(x, participant_df=participant_df),axis=1)
        #self.df_dict["shipcall_participant_map"] = spm

    def get_assigned_participants(self, shipcall) -> pd.DataFrame:
        """Return each participant of a respective shipcall, filtered by the shipcall id."""
        # get the shipcall_participant_map
        spm = self.df_dict["shipcall_participant_map"]
        assigned_participants = spm.loc[spm["shipcall_id"] == shipcall.id]
        return assigned_participants

    def get_assigned_participants_by_type(self, assigned_participants: pd.DataFrame, participant_type: ParticipantType):
        """Filters a dataframe of assigned_participants by the provided type enumerator."""
        if isinstance(participant_type, int):
            participant_type = ParticipantType(participant_type)

        assigned_participants_of_type = assigned_participants.loc[[participant_type in ParticipantType(int(pt_)) for pt_ in list(assigned_participants["type"].values)]]
        #assigned_participants_of_type = assigned_participants.loc[assigned_participants["type"]==participant_type.value]
        return assigned_participants_of_type

    def check_if_any_participant_of_type_is_unassigned(self, shipcall, *args: ParticipantType) -> bool:
        """
        Given a list of input arguments, where each item is a participant type, the function determines whether at least
        one participant is assigned for that type. Returns a boolean: whether any of the required participant types is unassigned.

        This method is extensively used for validation rule 0001, where the header is checked beforehand to identify whether
        the respective participant type is assigned already.
        """
        assigned_participants = self.get_assigned_participants(shipcall)

        unassigned = []  # becomes a list of booleans
        for participant_type in args:
            assignments_of_type = self.get_assigned_participants_by_type(assigned_participants, participant_type=participant_type)
            unassignment = len(assignments_of_type) == 0  # a participant type is unassigned when there is no match
            unassigned.append(unassignment)
        return any(unassigned)  # returns a single boolean: whether ANY of the types is not assigned
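
    # Sketch (the shipcall object is assumed; ParticipantType.AGENCY is the only
    # member referenced elsewhere in this module):
    #   missing = handler.check_if_any_participant_of_type_is_unassigned(shipcall, ParticipantType.AGENCY)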

    def standardize_model_str(self, model_str: str) -> str:
        """Check if the 'model_str' is valid and apply lowercasing to the string."""
        model_str = model_str.lower()
        assert model_str in list(self.df_dict.keys()), f"cannot find the requested 'model_str' in mysql: {model_str}"
        return model_str

    def get_data(self, id: int, model_str: str):
        """
        Obtains {id} from the respective mysql database and builds a data model from that.
        The id should match the 'id'-column in the mysql schema.
        returns: data model, such as Ship, Shipcall, etc.

        e.g.,
            data = self.get_data(0, "shipcall")
        returns a Shipcall object
        """
        model_str = self.standardize_model_str(model_str)

        df = self.df_dict.get(model_str)
        data = self.df_loc_to_data_model(df, id, model_str)
        return data

    def get_all(self, model_str: str) -> list:
        """
        Given a model string (e.g., 'shipcall'), return a list of all
        data models of that type from the sql.
        """
        model_str = self.standardize_model_str(model_str)
        all_ids = self.df_dict.get(model_str).index

        all_data = [
            self.get_data(_aid, model_str)
            for _aid in all_ids
        ]
        return all_data

    def df_loc_to_data_model(self, df, id, model_str, loc_type: str = "loc"):
        if len(df) == 0:
            warnings.warn(f"empty dataframe in SQLHandler.df_loc_to_data_model for model type: {model_str}\n")
            return df

        # get a pandas series from the dataframe
        series = df.loc[id] if loc_type == "loc" else df.iloc[id]

        # get the respective data model object
        data_model = self.str_to_model_dict.get(model_str, None)
        assert data_model is not None, f"could not find the requested model_str: {model_str}"

        # build 'data' and fill the data model object
        # convert the 'id' to an integer, so the np.uint64 (used by pandas) is convertible to mysql
        data = {**{'id': int(id)}, **series.to_dict()}  # 'id' must be added manually, as .to_dict does not contain the index, which was set with .set_index
        data = data_model(**data)
        return data

    def filter_df_by_participant_type(self, df, participant_type: typing.Union[int, ParticipantType]) -> pd.DataFrame:
        """
        As ParticipantTypes are Flag objects, a dataframe's integer might represent multiple participant types simultaneously.
        This function allows for more complex filters, as the IntFlag supports membership queries.

        e.g.:
            ParticipantType(6) combines 2 and 4 (2 + 4 == 6)

            ParticipantType(2) in ParticipantType(6)  # True: 6 is both 2 and 4
            ParticipantType(1) in ParticipantType(6)  # False: 6 is both 2 and 4, but not 1
        """
        if isinstance(participant_type, int):
            participant_type = ParticipantType(participant_type)
        filtered_df = df.loc[[participant_type in ParticipantType(df_pt) for df_pt in list(df["participant_type"].values)]]
        return filtered_df
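
    # Sketch of the flag-based filter (the values are illustrative):
    #   _df = pd.DataFrame({"participant_type": [2, 4, 6, 1]})
    #   handler.filter_df_by_participant_type(_df, 2)  # keeps the rows with 2 and 6, since 2 is contained in 6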

    def get_times_for_participant_type(self, df_times, participant_type: int):
        filtered_series = self.filter_df_by_participant_type(df_times, participant_type)
        #filtered_series = df_times.loc[df_times["participant_type"]==participant_type]

        if len(filtered_series) == 0:
            return None

        if len(filtered_series) > 1:
            # correcting the error: ERROR:root:found multiple results
            # however, a warning will still be issued
            warnings.warn(f"found multiple results in function SQLHandler.get_times_for_participant_type\nConsidering only the first match!\nAffected Times Indexes: {filtered_series.index}")

        times = self.df_loc_to_data_model(filtered_series, id=0, model_str='times', loc_type="iloc")  # use iloc to retrieve the first result
        return times

    def dataframe_to_data_model_list(self, df, model_str) -> list:
        model_str = self.standardize_model_str(model_str)

        all_ids = df.index
        all_data = [
            self.df_loc_to_data_model(df, _aid, model_str)
            for _aid in all_ids
        ]
        return all_data

    def get_participants(self, shipcall_id: int) -> list:
        """
        Given a {shipcall_id}, obtain the respective list of participants.
        When there are no participants, return a blank list.

        returns: participant_id_list, where every element is an int
        """
        df = self.df_dict.get("shipcall_participant_map")
        if 'shipcall_id' in list(df.columns):
            df = df.set_index('shipcall_id', inplace=False)

        # the 'if' check ensures that no Exception is raised when the shipcall_id is not present in the df
        participant_id_list = df.loc[shipcall_id, "participant_id"].tolist() if shipcall_id in list(df.index) else []
        if not isinstance(participant_id_list, list):
            participant_id_list = [participant_id_list]
        return participant_id_list
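
    # Sketch (the shipcall id and the returned ids are illustrative):
    #   handler.get_participants(3)  # -> e.g. [5, 9], or [] for an unknown shipcall_id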

    def get_times_of_shipcall(self, shipcall) -> pd.DataFrame:
        df_times = self.df_dict.get('times')  # -> pd.DataFrame
        df_times = df_times.loc[df_times["shipcall_id"] == shipcall.id]
        return df_times

    def get_times_for_agency(self, non_null_column=None) -> pd.DataFrame:
        """
        options:
            non_null_column:
                None or str. If provided, the 'non_null_column'-column of the dataframe will be filtered,
                so only entries with provided values are returned (filters all NaN and NaT entries).
        """
        # get all times
        df_times = self.df_dict.get('times')  # -> pd.DataFrame

        # filter out all NaN and NaT entries
        if non_null_column is not None:
            # the Pandas documentation says for .isnull():
            # "This function takes a scalar or array-like object and indicates whether values are missing
            # (NaN in numeric arrays, None or NaN in object arrays, NaT in datetimelike)."
            df_times = df_times.loc[~df_times[non_null_column].isnull()]  # NOT null filter

        # filter by the agency participant_type
        times_agency = self.filter_df_by_participant_type(df_times, ParticipantType.AGENCY.value)
        #times_agency = df_times.loc[df_times["participant_type"]==ParticipantType.AGENCY.value]
        return times_agency

    def filter_df_by_key_value(self, df, key, value) -> pd.DataFrame:
        return df.loc[df[key] == value]

    def get_unique_ship_counts(self, all_df_times: pd.DataFrame, times_agency: pd.DataFrame, query: str, rounding: str = "min", maximum_threshold=3):
        """Given a dataframe of all agency times, count all entries that match the agency's time for the {query} column. returns: counts (int)"""
        # #deprecated!
        warnings.warn("SQLHandler.get_unique_ship_counts is deprecated. Instead, please use SQLHandler.count_synchronous_shipcall_times")

        # optional: rounding
        if rounding is not None:
            all_df_times.loc[:, query] = pd.to_datetime(all_df_times.loc[:, query]).dt.round(rounding)  # e.g., 'min' --- corrects the error: 'AttributeError: Can only use .dt accessor with datetimelike values'
            query_time_agency = pd.to_datetime(times_agency[query]).iloc[0].round(rounding)  # e.g., 'min'

        # after rounding, filter {all_df_times}, so only those which match the current query are of interest
        # takes 'times_agency' to sample which value should match
        all_df_times = all_df_times.loc[all_df_times[query] == query_time_agency]

        # finally, count all remaining entries
        values = all_df_times.loc[:, query]

        # get unique entries and counts
        counts = len(values)  # unique, counts = np.unique(values, return_counts=True)
        return counts  # (values, unique, counts)

    def count_synchronous_shipcall_times(self, query_time: pd.Timestamp, all_df_times: typing.Optional[pd.DataFrame] = None, delta_threshold=900) -> int:
        """Count all times entries which are too close to the query_time; {delta_threshold} (in seconds) determines the threshold. returns: counts (int)"""
        if all_df_times is None:
            all_df_times = self.df_dict.get("times")
        return get_synchronous_shipcall_times_standalone(query_time, all_df_times, delta_threshold)
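
    # End-to-end sketch (the connection and timestamp are illustrative):
    #   handler = SQLHandler(conn, read_all=True)
    #   handler.count_synchronous_shipcall_times(pd.Timestamp("2023-01-01 10:05"))  # falls back to the cached 'times' dataframe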