Verified Commit 7707720e authored by Etienne Frank

WIP: Explore the automl

parent 5e5a3646
Pipeline #1452 failed
.gitignore:
 output_graphs/
+batteries_data_temp/
+results/
+current_results/
 # Byte-compiled / optimized / DLL files
...
.gitlab-ci.yml:
-before_script:
-  - pipenv install
-
-lint:
-  tags:
-    - ubuntu-docker
-  script:
-    - pipenv run flake8
+# before_script:
+#   - pkill -f apt
+#   - apt install -y python-dev
+#   - apt install -y python3-dev
+#   - pipenv install
+#
+# lint:
+#   tags:
+#     - ubuntu-docker
+#   script:
+#     - pipenv run flake8
Pipfile:
...
@@ -10,6 +10,12 @@ matplotlib = "*"
 pillow = "*"
 "flake8" = "*"
 pylint = "*"
+cython = "*"
+pandas = "*"
+pytest = "*"
+openpyxl = "*"
+"auto-sklearn2" = {ref = "8bdcba15caa28cb4336d9cb6ee4108078ab6d8a2", git = "git://github.com/automl/auto-sklearn.git"}
+auto-sklearn = "*"
 [dev-packages]
...
This diff is collapsed.
import shutil
import glob
import pickle
import os
import traceback
import time
import autosklearn.classification
import sys
from extraction.retrieve_battery_data import extract_charge_discharge_impedance
from functools import reduce
import itertools
import numpy as np
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
import multiprocessing
from concurrent.futures import ThreadPoolExecutor
from autosklearn.constants import *
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import accuracy
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix
import pandas as pd
# import concurrent.futures
# np.set_printoptions(threshold=np.nan)
SEPARATOR = "-----------------------------------"
RESULTS_FOLDER = "current_results"
MODEL_FILE_SEP = "___"
def chunk_it(seq, num):
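    """Split seq into num roughly equal chunks and give every element of
    chunk i the integer label i; the returned list has one label per element of seq."""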
avg = len(seq) / float(num)
last = 0
out = []
while last <= num:
res = [last] * (len(seq[int(last*avg):int((last + 1) * avg)]))
out += res
last += 1
return out
# DISCRETISATION_TYPES = ["mean", "median", "standard-deviation"]
# Underscores only: these names are joined with "-" to build cache/model file names
# and split on "-" again when a saved model is reloaded.
DISCRETISATION_TYPES = [
    "mean", "gradient_mean",
    "median", "gradient_median",
    "standard_deviation", "gradient_std",
]
BATTERY_BASE = "./data/BatteryAgingARC_25-44/"
TEST_BATTERIES = [BATTERY_BASE + "B0025.mat"]
ALL_BATTERIES = [
# BATTERY_BASE + "B0025.mat",
BATTERY_BASE + "B0026.mat",
BATTERY_BASE + "B0027.mat",
BATTERY_BASE + "B0028.mat",
# BATTERY_BASE + "B0033.mat",
# BATTERY_BASE + "B0034.mat",
]
charges_params = [
"voltage_measured",
"current_measured",
"temperature_measured",
"current_charge",
"voltage_charge",
]
discharges_params = charges_params + ["capacity"] # capacity not same length
impedance_params = [
    # "re",  # float_
    # "rct",  # float_
    "sense_current",  # complex values
    "battery_current",  # complex values
    "current_ratio",  # complex values
    "battery_impedance",  # complex values, stored as multiple arrays of one complex each
    # "rectified_impedance",  # same format, but not the same length
]
def discretize(array, discretisation):
    """Split array into discretisation["split"] chunks and compute, per chunk,
    the statistics listed in discretisation["types"]."""
    # np.array_split returns empty arrays when split > length; it should repeat values instead
    splitted = np.array_split(array, discretisation["split"])
    out = []
    if "mean" in discretisation["types"]:
        out.append([np.mean(split) for split in splitted])
    if "median" in discretisation["types"]:
        out.append([np.median(split) for split in splitted])
    if "standard_deviation" in discretisation["types"]:
        out.append([np.std(split) for split in splitted])
    splitted_gradient = np.array_split(np.gradient(array), discretisation["split"])
    if "gradient_mean" in discretisation["types"]:
        out.append([np.mean(split) for split in splitted_gradient])
    if "gradient_median" in discretisation["types"]:
        out.append([np.median(split) for split in splitted_gradient])
    if "gradient_std" in discretisation["types"]:
        out.append([np.std(split) for split in splitted_gradient])
    return out
    # out = np.array(out)
    # return out.T
def append_params(result_cycles, cycles, used_parameters, discretisation):
    """One output row contains one time slice of every feature."""
total_cycles_length = 0
for cycle in cycles:
# # Raw
# cycle_length = len(cycle[used_parameters[0]])
# total_cycles_length += cycle_length
# for i in range(cycle_length):
# time_slice = []
# for param in used_parameters:
# time_slice.append(np.nan_to_num(cycle[param][i]))
# result_cycles.append(time_slice)
# Discretisation
cycle_length = len(cycle[used_parameters[0]])
        if cycle_length < discretisation["split"]:
            raise ValueError("Discretisation split is too big compared to the cycle length")
discretized = []
for param in used_parameters:
np.nan_to_num(cycle[param], copy=False)
mean_median_std = discretize(cycle[param], discretisation)
discretized += mean_median_std
transposed_discretized = np.array(discretized).T
for time_slice in transposed_discretized:
result_cycles.append(time_slice)
total_cycles_length += len(transposed_discretized)
# CycleWay
        # reduce all features together; beforehand, each feature of length N should be reduced to a fixed length
# result_cycles.append(np.nan_to_num(np.array(
# reduce((lambda acc, param: acc + cycle[param].tolist()), used_parameters, [])
# )))
return total_cycles_length
def combinations(array):
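    """Return every non-empty combination of the elements of array, for all lengths from 1 to len(array)."""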
return list(itertools.chain(*[itertools.combinations(array,i+1) for i,_ in enumerate(array)]))
def get_charges(filepath):
charge_cycles, discharge_cycles, impedance_cycles = extract_charge_discharge_impedance(filepath)
return charge_cycles
def get_discharges(filepath):
charge_cycles, discharge_cycles, impedance_cycles = extract_charge_discharge_impedance(filepath)
return discharge_cycles
def get_impedance(filepath):
charge_cycles, discharge_cycles, impedance_cycles = extract_charge_discharge_impedance(filepath)
return impedance_cycles
def load_batteries(extract_function, params, all_batteries=ALL_BATTERIES, class_count=-3, discretisation=None):
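    """Extract (X, y) arrays for the given batteries, parameters and discretisation.

    The result is cached as a pickle under batteries_data_temp/; the cache file name
    encodes the parameters, battery ids, class count and discretisation settings.
    """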
all_batteries_number = [bat.split(os.sep)[-1].split(".")[0] for bat in all_batteries]
# cycles_filename = "data__" + "-".join(params) + "__" + "-".join(all_batteries_number) + "__" + str(class_count) + ".p"
cycles_filename = "__".join([
"batteries_data_temp/data",
"-".join(params),
"-".join(all_batteries_number),
str(class_count),
"-".join(discretisation["types"]),
str(discretisation["split"]),
]) + ".p"
X_cycles = []
y_cycles = []
    if os.path.exists(cycles_filename):
        with open(cycles_filename, "rb") as cache_file:
            loaded = pickle.load(cache_file)
        return loaded["x"], loaded["y"]
else:
for filepath in all_batteries:
cycles = extract_function(filepath)
# charge_cycles, discharge_cycles, impedance_cycles = extract_charge_discharge_impedance(filepath)
total_cycles_length = append_params(X_cycles, cycles, params, discretisation)
# add labels
y_cycles += chunk_it([-1]*total_cycles_length, class_count) # cycle way
# X_cycles = np.zeros([len(X_cycles),len(max(X_cycles,key = lambda x: len(x)))]) # padding #CycleWay
X_cycles = np.array(X_cycles)
y_cycles = np.array(y_cycles)
        # the cache folder is git-ignored, so make sure it exists before writing
        os.makedirs(os.path.dirname(cycles_filename), exist_ok=True)
        with open(cycles_filename, "wb") as cache_file:
            pickle.dump({"x": X_cycles, "y": y_cycles}, cache_file)
return X_cycles, y_cycles
def save_dataframe(cv_results, model_file_name):
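    """Export auto-sklearn's cv_results_ to an Excel sheet named <model_file_name>.xlsx."""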
df = pd.DataFrame(cv_results)
writer = pd.ExcelWriter(model_file_name + ".xlsx")
df.to_excel(writer, "automl_dataframe")
writer.save()
def auto_ML(options):
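    """Run one auto-sklearn experiment described by an options tuple
    (title, params, extract_function, class_count, discretisation, folds),
    save the fitted model and its CV results, and return a list of result strings."""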
title, params, extract_function, class_count, discretisation, folds = options
print("--------------------------------------------------------------------------------")
print(title)
print(params)
print("--------------------------------------------------------------------------------")
X_cycles, y_cycles = load_batteries(extract_function, params, ALL_BATTERIES, class_count, discretisation)
# y_cycles = chunk_it(X_cycles, class_count) can be removed
# y_cycles = np.array(y_cycles) can be removed
print(X_cycles.shape)
print(y_cycles.shape)
    # dataset sizes (before the train/test split) that ran fine:
# 40'000 inputs and 6 classes CV !not certain!
# 930'000 inputs and 6 classes holdout !not certain!
# 25'860 inputs and 3 classes CV
MAX_SIZE = 258600000
X_train, X_test, y_train, y_test = \
sklearn.model_selection.train_test_split(X_cycles, y_cycles, random_state=1)
X_train = X_train[:MAX_SIZE]
y_train = y_train[:MAX_SIZE]
# 0.3 is quite good
# k = 0.3
k = 0.5
automl = autosklearn.classification.AutoSklearnClassifier(
# time_left_for_this_task=69,
# per_run_time_limit=35,
time_left_for_this_task=int(3600*k),
per_run_time_limit=int(360*k),
ensemble_size=int(50),
ensemble_nbest=int(200),
# ml_memory_limit=1024,
# shared_mode=True,
# ensemble_size=50,
# ensemble_nbest=200,
# tmp_folder=tmp_folder,
# TODO Use CV instead of HOLDOUT
# resampling_strategy='holdout',
resampling_strategy='cv',
resampling_strategy_arguments={'folds': folds},
# output_folder=output_folder,
# initial_configurations_via_metalearning=0,
# seed=SEED,
# time_left_for_this_task=3600*18,
# per_run_time_limit=360*18,
ml_memory_limit=28024,
# delete_tmp_folder_after_terminate=False,
# delete_output_folder_after_terminate=False,
)
automl.fit(X_train.copy(), y_train.copy())
if automl.resampling_strategy == "cv":
automl.refit(X_train.copy(), y_train.copy())
y_pred = automl.predict(X_test, n_jobs=-1)
model_file_name = os.path.join(RESULTS_FOLDER, MODEL_FILE_SEP.join([
title,
"-".join(params),
str(class_count),
"-".join(discretisation["types"]),
str(discretisation["split"]),
"folds" + str(folds),
]) + ".joblib")
print(model_file_name)
save_dataframe(automl.cv_results_, model_file_name[:-7])
joblib.dump(automl, model_file_name)
test_battery_result = test_model_with_test_battery(model_file_name)
results = [
title,
"Params: " + str(params),
"Discretisation: " + str(discretisation),
"Class count: " + str(class_count),
"Folds: " + str(folds),
"Accuracy score: " + str(sklearn.metrics.accuracy_score(y_test, y_pred)),
"F1 score: " + str(precision_recall_fscore_support(y_test, y_pred, average="weighted")),
str(confusion_matrix(y_test, y_pred)),
] + test_battery_result + [
str(automl.sprint_statistics()),
str(automl.show_models()),
# str(automl.cv_results_),
]
print(results)
return results
def get_options(min_length=0):
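    """Build the list of experiment option tuples (title, params, extract_function,
    class_count, discretisation, folds); alternative configurations are kept commented out."""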
class_count = 3
# 10, 50, 100, 200
charge_options = []
discharge_options = []
impedance_options = []
# charge_options = [("charge", param, get_charges, class_count) for param in combinations(charges_params) if len(param) >= min_length]
# charge_options = [("charge", charges_params, get_charges, class_count, {"types": discre_type, "split":100}) for discre_type in combinations(DISCRETISATION_TYPES)]
# discharge_options = [("discharge", charges_params, get_discharges, class_count, {"types": discre_type, "split":100}) for discre_type in combinations(DISCRETISATION_TYPES)]
# discharge_options = [("discharge", param, get_discharges, class_count) for param in combinations(charges_params) if len(param) >= min_length]
# impedance_options = [("impedance", param, get_impedance, class_count) for param in combinations(impedance_params) if len(param) >= min_length]
# charge_options = [("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":split}) for split in range(1,11)]
charge_options = [
# ("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":10}, 10),
("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":6}, 5),
("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":6}, 10),
("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":6}, 15),
("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":6}, 20),
# ("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":50}, 10),
# ("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":100}, 10),
# # ("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":1}, 10),
# ("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":200}, 10),
]
# discharge_options = [("discharge", charges_params, get_discharges, class_count, {"types": DISCRETISATION_TYPES, "split":100})]
# impedance_options = [("impedance", impedance_params, get_impedance, class_count, {"types": DISCRETISATION_TYPES, "split":20})]
all_options = discharge_options + impedance_options + charge_options
return all_options
def writeln(file, text):
file.write(text + "\n")
def init_folder():
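    """Recreate the results folder and return the paths of empty result.txt and failed.txt log files."""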
if os.path.exists(RESULTS_FOLDER):
shutil.rmtree(RESULTS_FOLDER)
if not os.path.exists(RESULTS_FOLDER):
os.makedirs(RESULTS_FOLDER)
results_file = os.path.join(RESULTS_FOLDER, "result.txt")
failed_file = os.path.join(RESULTS_FOLDER, "failed.txt")
open(results_file, 'w').close()
open(failed_file, 'w').close()
return results_file, failed_file
def execute_auto_ML():
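    """Run auto_ML for every option, appending successes to result.txt and failures
    (with their traceback) to failed.txt, then print the total execution time."""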
start_time = time.time()
print(start_time)
all_options = get_options(min_length=0)
results_file, failed_file = init_folder()
# results = list(map(auto_ML, all_options))
for option in all_options:
try:
result = auto_ML(option)
with open(results_file, 'a') as the_file:
writeln(the_file, SEPARATOR)
for line in result:
writeln(the_file, str(line))
writeln(the_file, SEPARATOR)
        except Exception:
with open(failed_file, 'a') as the_file:
writeln(the_file, SEPARATOR)
writeln(the_file, str(option))
writeln(the_file, str(traceback.format_exc()))
writeln(the_file, SEPARATOR)
seconds = time.time() - start_time
print("execution time", int(seconds / 60 / 60), "h", int(seconds / 60) % 60, "m", int(seconds) % 60, "s")
def test_model_with_test_battery(model_filepath):
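    """Evaluate a saved model on the held-out test batteries.

    The experiment settings (extraction type, params, class count, discretisation)
    are parsed back from the model file name built in auto_ML.
    """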
batteries = TEST_BATTERIES
clean_model_filepath = model_filepath.split(os.sep)[-1].split(".")[0].split(MODEL_FILE_SEP)
params = clean_model_filepath[1].split("-")
class_count = int(clean_model_filepath[2])
discretisation = {
"types": clean_model_filepath[3].split("-"),
"split": int(clean_model_filepath[4]),
}
if clean_model_filepath[0] == "charge":
extract_function = get_charges
if clean_model_filepath[0] == "discharge":
extract_function = get_discharges
if clean_model_filepath[0] == "impedance":
extract_function = get_impedance
X_cycles, y_true = load_batteries(extract_function, params, batteries, class_count, discretisation)
    try:
        loaded_automl = joblib.load(model_filepath)
    except Exception:
        print("Error in model loading -> " + model_filepath)
        raise
    print(loaded_automl)
y_pred = loaded_automl.predict(X_cycles, n_jobs=-1)
result = [
"Batteries: " + str(batteries),
"Accuracy score for battery: " + str(sklearn.metrics.accuracy_score(y_true, y_pred)),
str(precision_recall_fscore_support(y_true, y_pred, average="weighted")),
str(confusion_matrix(y_true, y_pred)),
]
return result
def test_models_with_test_battery(folder_to_test=RESULTS_FOLDER):
# get all model inside the folder
for model_filepath in glob.glob(folder_to_test+os.sep+"*.joblib"):
results = test_model_with_test_battery(model_filepath)
for result in results:
print(result)
if __name__ == "__main__":
# print(test_model_with_test_battery("results/004_fixed_labels_1h_charge/charge___voltage_measured-current_measured-temperature_measured-current_charge-voltage_charge___3.joblib"))
execute_auto_ML()
    # print(discretize(np.array([1,2,3,4,5,6,7,8,9,10,11,12]), {"types": ["mean", "median", "standard_deviation"], "split": 4}))
# test_models_with_test_battery("results/004_fixed_labels_1h_charge/")