From bab8eff47bc103d6558df1fe3367f00b6e5b5299 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Assun=C3=A7ao=20Jeshon?= Date: Fri, 21 Sep 2018 15:19:35 +0200 Subject: [PATCH 1/2] Create file which contains function to load .mat files into DF and separate it into training/validation/test --- extraction/retrieve_battery_data.py | 2 +- load_data.py | 162 ++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 load_data.py diff --git a/extraction/retrieve_battery_data.py b/extraction/retrieve_battery_data.py index 329c0df..9e557e2 100644 --- a/extraction/retrieve_battery_data.py +++ b/extraction/retrieve_battery_data.py @@ -23,7 +23,7 @@ def _split_mat(mat): def _load_mat_file(filepath): """Load the matlab file""" - filename = filepath.split("/")[-1].split(".")[0] + filename = os.path.split(filepath)[-1].split(".")[0] raw_mat = scipy.io.loadmat(filepath) mat = raw_mat[filename][0][0][0][0] diff --git a/load_data.py b/load_data.py new file mode 100644 index 0000000..a6fc28b --- /dev/null +++ b/load_data.py @@ -0,0 +1,162 @@ +import pandas as pd +import numpy as np +import os +import random +from extraction.retrieve_battery_data import extract_charge_discharge_impedance + + +def build_files(folder_to_exclude, src_dir=os.getcwd()): + data_folder = os.path.join(src_dir, 'data') + + dict_files = {} + for directory in os.listdir(data_folder): + fullpath_dir = os.path.join(data_folder, directory) + if os.path.isdir(fullpath_dir) and directory not in folder_to_exclude: + dict_files[directory] = [os.path.join(fullpath_dir, file) for file + in os.listdir(fullpath_dir) if + file.endswith('.mat')] + + return dict_files + + +def mat_to_pandas(files, bat_to_keep): + print("Loading mat files ....") + print("Please, wait a bit.") + + df = pd.DataFrame() + for folder_name, filepaths in files.items(): + for filepath in filepaths: + battery_nb = int( + os.path.splitext(os.path.basename(filepath))[0].replace('B', '') + ) + charge_nb = 1 + discharge_nb = 1 + + if battery_nb in bat_to_keep: + charge_items, discharge_items, impedance_items = extract_charge_discharge_impedance( + filepath) + + for charge in charge_items: + df_charge = pd.DataFrame( + columns=['battery_nb', 'datetime', 'charge_nb', + 'voltage_measured', 'current_measured', + 'temperature_measured', 'current_charge', + 'voltage_charge', 'ambiant_temp'] + ) + df_charge['voltage_measured'] = charge['voltage_measured'] + df_charge['current_measured'] = charge['current_measured'] + df_charge['temperature_measured'] = charge[ + 'temperature_measured'] + df_charge['current_charge'] = charge['current_charge'] + df_charge['voltage_charge'] = charge['voltage_charge'] + df_charge['ambiant_temp'] = charge['ambiant_temp'] + df_charge['battery_nb'] = [battery_nb] * len( + charge['voltage_measured']) + df_charge['charge_nb'] = [charge_nb] * len( + charge['voltage_measured']) + df_charge['datetime'] = pd.Timestamp( + charge['datetime']) + pd.to_timedelta(charge['time'][0], + unit='s') + df = df.append(df_charge, ignore_index=True, sort=False) + + charge_nb = charge_nb + 1 + + for discharge in discharge_items: + df_discharge = pd.DataFrame( + columns=['battery_nb', 'datetime', 'discharge_nb', + 'voltage_measured', 'current_measured', + 'temperature_measured', 'current_charge', + 'voltage_charge', 'capacity', 'ambiant_temp'] + ) + df_discharge['voltage_measured'] = discharge[ + 'voltage_measured'] + df_discharge['current_measured'] = discharge[ + 'current_measured'] + df_discharge['temperature_measured'] = discharge[ + 'temperature_measured'] + df_discharge['current_charge'] = discharge['current_charge'] + df_discharge['voltage_charge'] = discharge['voltage_charge'] + df_discharge['capacity'] = discharge['capacity'][0] if len( + discharge['capacity']) > 0 else [np.NaN] * len( + discharge['voltage_measured']) + df_discharge['ambiant_temp'] = discharge['ambiant_temp'] + df_discharge['battery_nb'] = [battery_nb] * len( + discharge['voltage_measured']) + df_discharge['discharge_nb'] = [discharge_nb] * len( + discharge['voltage_measured']) + df_discharge['datetime'] = pd.Timestamp( + discharge['datetime']) + pd.to_timedelta( + discharge['time'], unit='s') + df = df.append(df_discharge, ignore_index=True, sort=False) + + discharge_nb = discharge_nb + 1 + + print("Datas loaded !") + + return df + +def get_splitted_datas(df, training_size=4, validation_size=1, test_size=1): + # Check if args are numbers + try: + training_size = int(training_size) + validation_size = int(validation_size) + test_size = int(test_size) + except: + print( + "Please, enter int numbers for training_size, validation_size and test_size") + return + + # Check if sum of args is <= battery_nb + battery_nb = df['battery_nb'].unique().tolist() + if training_size + validation_size + test_size > len(battery_nb): + print( + "Ooops, size of training_size+validation_size+test_size > numbers of battery available.") + return + + # Shuffle the list of battery_nb + battery_nb = random.sample(battery_nb, len(battery_nb)) + + # Get the batteries nb and return DFs + training_bat = battery_nb[0:training_size] + validation_bat = battery_nb[training_size:training_size + validation_size] + test_bat = battery_nb[training_size + validation_size: training_size + validation_size + test_size] + + return df[df['battery_nb'].isin(training_bat)],\ + df[df['battery_nb'].isin(validation_bat)], \ + df[df['battery_nb'].isin(test_bat)] + + +def set_labels_Y(df): + len_df = len(df) + + # Sort the DF by time + df.sort_values(by=['datetime'], inplace=True) + + # Create labels array + good_array = [1] * (len_df // 3) # 1 = Good quality + middle_array = [2] * (len_df // 3) # 1 = Middle quality + bad_array = [3] * (len_df // 3) # 1 = Bad quality + + missing_datas_nb = len_df - ( + len(good_array) + len(middle_array) + len(bad_array)) + good_array = good_array + ([1] * missing_datas_nb) + + # Insert labels to df + df['quality'] = good_array + middle_array + bad_array + + return df + + +def main(): + batteries_to_keep = [25, 26, 27, 28, 33, 34] + folder_to_exclude = ['BatteryAgingARC_25_26_27_28_P1'] + dict_files = build_files(folder_to_exclude) + + df = mat_to_pandas(dict_files, batteries_to_keep) + + df = set_labels_Y(df) + + df_training, df_validation, df_test = get_splitted_datas(df) + +if __name__ == "__main__": + main() \ No newline at end of file -- GitLab From 767c5c5cfdf2477f12892747981a7450b69f353a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Assun=C3=A7ao=20Jeshon?= Date: Mon, 24 Sep 2018 10:51:05 +0200 Subject: [PATCH 2/2] Fix some reviews --- load_data.py | 111 ++++++++++++++++++++++++++++----------------------- 1 file changed, 61 insertions(+), 50 deletions(-) diff --git a/load_data.py b/load_data.py index a6fc28b..6ed6c0d 100644 --- a/load_data.py +++ b/load_data.py @@ -19,6 +19,61 @@ def build_files(folder_to_exclude, src_dir=os.getcwd()): return dict_files +def extract_charge_items(charge, battery_nb, charge_nb): + df_charge = pd.DataFrame( + columns=['battery_nb', 'datetime', 'charge_nb', + 'voltage_measured', 'current_measured', + 'temperature_measured', 'current_charge', + 'voltage_charge', 'ambiant_temp'] + ) + df_charge['voltage_measured'] = charge['voltage_measured'] + df_charge['current_measured'] = charge['current_measured'] + df_charge['temperature_measured'] = charge[ + 'temperature_measured'] + df_charge['current_charge'] = charge['current_charge'] + df_charge['voltage_charge'] = charge['voltage_charge'] + df_charge['ambiant_temp'] = charge['ambiant_temp'] + df_charge['battery_nb'] = [battery_nb] * len( + charge['voltage_measured']) + df_charge['charge_nb'] = [charge_nb] * len( + charge['voltage_measured']) + df_charge['datetime'] = pd.Timestamp( + charge['datetime']) + pd.to_timedelta(charge['time'][0], + unit='s') + + return df_charge + + +def extract_discharge_items(discharge, battery_nb, discharge_nb): + df_discharge = pd.DataFrame( + columns=['battery_nb', 'datetime', 'discharge_nb', + 'voltage_measured', 'current_measured', + 'temperature_measured', 'current_charge', + 'voltage_charge', 'capacity', 'ambiant_temp'] + ) + df_discharge['voltage_measured'] = discharge[ + 'voltage_measured'] + df_discharge['current_measured'] = discharge[ + 'current_measured'] + df_discharge['temperature_measured'] = discharge[ + 'temperature_measured'] + df_discharge['current_charge'] = discharge['current_charge'] + df_discharge['voltage_charge'] = discharge['voltage_charge'] + df_discharge['capacity'] = discharge['capacity'][0] if len( + discharge['capacity']) > 0 else [np.NaN] * len( + discharge['voltage_measured']) + df_discharge['ambiant_temp'] = discharge['ambiant_temp'] + df_discharge['battery_nb'] = [battery_nb] * len( + discharge['voltage_measured']) + df_discharge['discharge_nb'] = [discharge_nb] * len( + discharge['voltage_measured']) + df_discharge['datetime'] = pd.Timestamp( + discharge['datetime']) + pd.to_timedelta( + discharge['time'], unit='s') + + return df_discharge + + def mat_to_pandas(files, bat_to_keep): print("Loading mat files ....") print("Please, wait a bit.") @@ -37,56 +92,13 @@ def mat_to_pandas(files, bat_to_keep): filepath) for charge in charge_items: - df_charge = pd.DataFrame( - columns=['battery_nb', 'datetime', 'charge_nb', - 'voltage_measured', 'current_measured', - 'temperature_measured', 'current_charge', - 'voltage_charge', 'ambiant_temp'] - ) - df_charge['voltage_measured'] = charge['voltage_measured'] - df_charge['current_measured'] = charge['current_measured'] - df_charge['temperature_measured'] = charge[ - 'temperature_measured'] - df_charge['current_charge'] = charge['current_charge'] - df_charge['voltage_charge'] = charge['voltage_charge'] - df_charge['ambiant_temp'] = charge['ambiant_temp'] - df_charge['battery_nb'] = [battery_nb] * len( - charge['voltage_measured']) - df_charge['charge_nb'] = [charge_nb] * len( - charge['voltage_measured']) - df_charge['datetime'] = pd.Timestamp( - charge['datetime']) + pd.to_timedelta(charge['time'][0], - unit='s') + df_charge = extract_charge_items(charge, battery_nb, charge_nb) df = df.append(df_charge, ignore_index=True, sort=False) charge_nb = charge_nb + 1 for discharge in discharge_items: - df_discharge = pd.DataFrame( - columns=['battery_nb', 'datetime', 'discharge_nb', - 'voltage_measured', 'current_measured', - 'temperature_measured', 'current_charge', - 'voltage_charge', 'capacity', 'ambiant_temp'] - ) - df_discharge['voltage_measured'] = discharge[ - 'voltage_measured'] - df_discharge['current_measured'] = discharge[ - 'current_measured'] - df_discharge['temperature_measured'] = discharge[ - 'temperature_measured'] - df_discharge['current_charge'] = discharge['current_charge'] - df_discharge['voltage_charge'] = discharge['voltage_charge'] - df_discharge['capacity'] = discharge['capacity'][0] if len( - discharge['capacity']) > 0 else [np.NaN] * len( - discharge['voltage_measured']) - df_discharge['ambiant_temp'] = discharge['ambiant_temp'] - df_discharge['battery_nb'] = [battery_nb] * len( - discharge['voltage_measured']) - df_discharge['discharge_nb'] = [discharge_nb] * len( - discharge['voltage_measured']) - df_discharge['datetime'] = pd.Timestamp( - discharge['datetime']) + pd.to_timedelta( - discharge['time'], unit='s') + df_discharge = extract_discharge_items(discharge, battery_nb, discharge_nb) df = df.append(df_discharge, ignore_index=True, sort=False) discharge_nb = discharge_nb + 1 @@ -134,11 +146,10 @@ def set_labels_Y(df): # Create labels array good_array = [1] * (len_df // 3) # 1 = Good quality - middle_array = [2] * (len_df // 3) # 1 = Middle quality - bad_array = [3] * (len_df // 3) # 1 = Bad quality + middle_array = [2] * (len_df // 3) # 2 = Middle quality + bad_array = [3] * (len_df // 3) # 3 = Bad quality - missing_datas_nb = len_df - ( - len(good_array) + len(middle_array) + len(bad_array)) + missing_datas_nb = len_df - ((len_df // 3) * 3) good_array = good_array + ([1] * missing_datas_nb) # Insert labels to df @@ -159,4 +170,4 @@ def main(): df_training, df_validation, df_test = get_splitted_datas(df) if __name__ == "__main__": - main() \ No newline at end of file + main() -- GitLab