Verified Commit 7707720e authored by Etienne Frank

WIP: Explore the automl

parent 5e5a3646
Pipeline #1452 failed
.gitignore:
 output_graphs/
+batteries_data_temp/
+results/
+current_results/
 # Byte-compiled / optimized / DLL files
...
.gitlab-ci.yml:
-before_script:
-  - pipenv install
-
-lint:
-  tags:
-    - ubuntu-docker
-  script:
-    - pipenv run flake8
+# before_script:
+#   - pkill -f apt
+#   - apt install -y python-dev
+#   - apt install -y python3-dev
+#   - pipenv install
+#
+# lint:
+#   tags:
+#     - ubuntu-docker
+#   script:
+#     - pipenv run flake8
Pipfile:
...
@@ -10,6 +10,12 @@ matplotlib = "*"
 pillow = "*"
 "flake8" = "*"
 pylint = "*"
+cython = "*"
+pandas = "*"
+pytest = "*"
+openpyxl = "*"
+"auto-sklearn2" = {ref = "8bdcba15caa28cb4336d9cb6ee4108078ab6d8a2", git = "git://github.com/automl/auto-sklearn.git"}
+auto-sklearn = "*"
 [dev-packages]
...
This diff is collapsed.
import shutil
import glob
import pickle
import os
import traceback
import time
import autosklearn.classification
import sys
from extraction.retrieve_battery_data import extract_charge_discharge_impedance
from functools import reduce
import itertools
import numpy as np
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
import multiprocessing
from concurrent.futures import ThreadPoolExecutor
from autosklearn.constants import *
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import accuracy
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix
import pandas as pd
# import concurrent.futures
# np.set_printoptions(threshold=np.nan)
SEPARATOR = "-----------------------------------"
RESULTS_FOLDER = "current_results"
MODEL_FILE_SEP = "___"
def chunk_it(seq, num):
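    """Split seq into num roughly equal chunks and give every element of
    chunk i the integer label i; the returned list has one label per element of seq."""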
avg = len(seq) / float(num)
last = 0
out = []
while last <= num:
res = [last] * (len(seq[int(last*avg):int((last + 1) * avg)]))
out += res
last += 1
return out
# DISCRETISATION_TYPES = ["mean", "median", "standard-deviation"]
# Underscores only: these names are joined with "-" to build cache/model file names
# and split on "-" again when a saved model is reloaded.
DISCRETISATION_TYPES = [
    "mean", "gradient_mean",
    "median", "gradient_median",
    "standard_deviation", "gradient_std",
]
BATTERY_BASE = "./data/BatteryAgingARC_25-44/"
TEST_BATTERIES = [BATTERY_BASE + "B0025.mat"]
ALL_BATTERIES = [
# BATTERY_BASE + "B0025.mat",
BATTERY_BASE + "B0026.mat",
BATTERY_BASE + "B0027.mat",
BATTERY_BASE + "B0028.mat",
# BATTERY_BASE + "B0033.mat",
# BATTERY_BASE + "B0034.mat",
]
charges_params = [
"voltage_measured",
"current_measured",
"temperature_measured",
"current_charge",
"voltage_charge",
]
discharges_params = charges_params + ["capacity"] # capacity not same length
impedance_params = [
    # "re",  # float_
    # "rct",  # float_
    "sense_current",  # complex values
    "battery_current",  # complex values
    "current_ratio",  # complex values
    "battery_impedance",  # complex values, stored as multiple arrays of one complex each
    # "rectified_impedance",  # same format, but not the same length
]
def discretize(array, discretisation):
    """Split array into discretisation["split"] chunks and compute, per chunk,
    the statistics listed in discretisation["types"]."""
    # np.array_split returns empty arrays when split > length; it should repeat values instead
    splitted = np.array_split(array, discretisation["split"])
    out = []
    if "mean" in discretisation["types"]:
        out.append([np.mean(split) for split in splitted])
    if "median" in discretisation["types"]:
        out.append([np.median(split) for split in splitted])
    if "standard_deviation" in discretisation["types"]:
        out.append([np.std(split) for split in splitted])
    splitted_gradient = np.array_split(np.gradient(array), discretisation["split"])
    if "gradient_mean" in discretisation["types"]:
        out.append([np.mean(split) for split in splitted_gradient])
    if "gradient_median" in discretisation["types"]:
        out.append([np.median(split) for split in splitted_gradient])
    if "gradient_std" in discretisation["types"]:
        out.append([np.std(split) for split in splitted_gradient])
    return out
    # out = np.array(out)
    # return out.T
def append_params(result_cycles, cycles, used_parameters, discretisation):
    """One output row contains one time slice of every feature."""
total_cycles_length = 0
for cycle in cycles:
# # Raw
# cycle_length = len(cycle[used_parameters[0]])
# total_cycles_length += cycle_length
# for i in range(cycle_length):
# time_slice = []
# for param in used_parameters:
# time_slice.append(np.nan_to_num(cycle[param][i]))
# result_cycles.append(time_slice)
# Discretisation
cycle_length = len(cycle[used_parameters[0]])
        if cycle_length < discretisation["split"]:
            raise ValueError("Discretisation split is too big compared to the cycle length")
discretized = []
for param in used_parameters:
np.nan_to_num(cycle[param], copy=False)
mean_median_std = discretize(cycle[param], discretisation)
discretized += mean_median_std
transposed_discretized = np.array(discretized).T
for time_slice in transposed_discretized:
result_cycles.append(time_slice)
total_cycles_length += len(transposed_discretized)
# CycleWay
        # reduce all features together; beforehand, each feature of length N should be reduced to a fixed length
# result_cycles.append(np.nan_to_num(np.array(
# reduce((lambda acc, param: acc + cycle[param].tolist()), used_parameters, [])
# )))
return total_cycles_length
def combinations(array):
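    """Return every non-empty combination of the elements of array, for all lengths from 1 to len(array)."""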
return list(itertools.chain(*[itertools.combinations(array,i+1) for i,_ in enumerate(array)]))
def get_charges(filepath):
charge_cycles, discharge_cycles, impedance_cycles = extract_charge_discharge_impedance(filepath)
return charge_cycles
def get_discharges(filepath):
charge_cycles, discharge_cycles, impedance_cycles = extract_charge_discharge_impedance(filepath)
return discharge_cycles
def get_impedance(filepath):
charge_cycles, discharge_cycles, impedance_cycles = extract_charge_discharge_impedance(filepath)
return impedance_cycles
def load_batteries(extract_function, params, all_batteries=ALL_BATTERIES, class_count=-3, discretisation=None):
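    """Extract (X, y) arrays for the given batteries, parameters and discretisation.

    The result is cached as a pickle under batteries_data_temp/; the cache file name
    encodes the parameters, battery ids, class count and discretisation settings.
    """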
all_batteries_number = [bat.split(os.sep)[-1].split(".")[0] for bat in all_batteries]
# cycles_filename = "data__" + "-".join(params) + "__" + "-".join(all_batteries_number) + "__" + str(class_count) + ".p"
cycles_filename = "__".join([
"batteries_data_temp/data",
"-".join(params),
"-".join(all_batteries_number),
str(class_count),
"-".join(discretisation["types"]),
str(discretisation["split"]),
]) + ".p"
X_cycles = []
y_cycles = []
    if os.path.exists(cycles_filename):
        with open(cycles_filename, "rb") as cache_file:
            loaded = pickle.load(cache_file)
        return loaded["x"], loaded["y"]
else:
for filepath in all_batteries:
cycles = extract_function(filepath)
# charge_cycles, discharge_cycles, impedance_cycles = extract_charge_discharge_impedance(filepath)
total_cycles_length = append_params(X_cycles, cycles, params, discretisation)
# add labels
y_cycles += chunk_it([-1]*total_cycles_length, class_count) # cycle way
# X_cycles = np.zeros([len(X_cycles),len(max(X_cycles,key = lambda x: len(x)))]) # padding #CycleWay
X_cycles = np.array(X_cycles)
y_cycles = np.array(y_cycles)
        # the cache folder is git-ignored, so make sure it exists before writing
        os.makedirs(os.path.dirname(cycles_filename), exist_ok=True)
        with open(cycles_filename, "wb") as cache_file:
            pickle.dump({"x": X_cycles, "y": y_cycles}, cache_file)
return X_cycles, y_cycles
def save_dataframe(cv_results, model_file_name):
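    """Export auto-sklearn's cv_results_ to an Excel sheet named <model_file_name>.xlsx."""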
df = pd.DataFrame(cv_results)
writer = pd.ExcelWriter(model_file_name + ".xlsx")
df.to_excel(writer, "automl_dataframe")
writer.save()
def auto_ML(options):
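    """Run one auto-sklearn experiment described by an options tuple
    (title, params, extract_function, class_count, discretisation, folds),
    save the fitted model and its CV results, and return a list of result strings."""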
title, params, extract_function, class_count, discretisation, folds = options
print("--------------------------------------------------------------------------------")
print(title)
print(params)
print("--------------------------------------------------------------------------------")
X_cycles, y_cycles = load_batteries(extract_function, params, ALL_BATTERIES, class_count, discretisation)
# y_cycles = chunk_it(X_cycles, class_count) can be removed
# y_cycles = np.array(y_cycles) can be removed
print(X_cycles.shape)
print(y_cycles.shape)
    # dataset sizes (before the train/test split) that ran fine:
# 40'000 inputs and 6 classes CV !not certain!
# 930'000 inputs and 6 classes holdout !not certain!
# 25'860 inputs and 3 classes CV
MAX_SIZE = 258600000
X_train, X_test, y_train, y_test = \
sklearn.model_selection.train_test_split(X_cycles, y_cycles, random_state=1)
X_train = X_train[:MAX_SIZE]
y_train = y_train[:MAX_SIZE]
# 0.3 is quite good
# k = 0.3
k = 0.5
automl = autosklearn.classification.AutoSklearnClassifier(
# time_left_for_this_task=69,
# per_run_time_limit=35,
time_left_for_this_task=int(3600*k),
per_run_time_limit=int(360*k),
ensemble_size=int(50),
ensemble_nbest=int(200),
# ml_memory_limit=1024,
# shared_mode=True,
# ensemble_size=50,
# ensemble_nbest=200,
# tmp_folder=tmp_folder,
# TODO Use CV instead of HOLDOUT
# resampling_strategy='holdout',
resampling_strategy='cv',
resampling_strategy_arguments={'folds': folds},
# output_folder=output_folder,
# initial_configurations_via_metalearning=0,
# seed=SEED,
# time_left_for_this_task=3600*18,
# per_run_time_limit=360*18,
ml_memory_limit=28024,
# delete_tmp_folder_after_terminate=False,
# delete_output_folder_after_terminate=False,
)
automl.fit(X_train.copy(), y_train.copy())
if automl.resampling_strategy == "cv":
automl.refit(X_train.copy(), y_train.copy())
y_pred = automl.predict(X_test, n_jobs=-1)
model_file_name = os.path.join(RESULTS_FOLDER, MODEL_FILE_SEP.join([
title,
"-".join(params),
str(class_count),
"-".join(discretisation["types"]),
str(discretisation["split"]),
"folds" + str(folds),
]) + ".joblib")
print(model_file_name)
save_dataframe(automl.cv_results_, model_file_name[:-7])
joblib.dump(automl, model_file_name)
test_battery_result = test_model_with_test_battery(model_file_name)
results = [
title,
"Params: " + str(params),
"Discretisation: " + str(discretisation),
"Class count: " + str(class_count),
"Folds: " + str(folds),
"Accuracy score: " + str(sklearn.metrics.accuracy_score(y_test, y_pred)),
"F1 score: " + str(precision_recall_fscore_support(y_test, y_pred, average="weighted")),
str(confusion_matrix(y_test, y_pred)),
] + test_battery_result + [
str(automl.sprint_statistics()),
str(automl.show_models()),
# str(automl.cv_results_),
]
print(results)
return results
def get_options(min_length=0):
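    """Build the list of experiment option tuples (title, params, extract_function,
    class_count, discretisation, folds); alternative configurations are kept commented out."""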
class_count = 3
# 10, 50, 100, 200
charge_options = []
discharge_options = []
impedance_options = []
# charge_options = [("charge", param, get_charges, class_count) for param in combinations(charges_params) if len(param) >= min_length]
# charge_options = [("charge", charges_params, get_charges, class_count, {"types": discre_type, "split":100}) for discre_type in combinations(DISCRETISATION_TYPES)]
# discharge_options = [("discharge", charges_params, get_discharges, class_count, {"types": discre_type, "split":100}) for discre_type in combinations(DISCRETISATION_TYPES)]
# discharge_options = [("discharge", param, get_discharges, class_count) for param in combinations(charges_params) if len(param) >= min_length]
# impedance_options = [("impedance", param, get_impedance, class_count) for param in combinations(impedance_params) if len(param) >= min_length]
# charge_options = [("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":split}) for split in range(1,11)]
charge_options = [
# ("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":10}, 10),
("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":6}, 5),
("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":6}, 10),
("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":6}, 15),
("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":6}, 20),
# ("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":50}, 10),
# ("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":100}, 10),
# # ("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":1}, 10),
# ("charge", charges_params, get_charges, class_count, {"types": DISCRETISATION_TYPES, "split":200}, 10),
]
# discharge_options = [("discharge", charges_params, get_discharges, class_count, {"types": DISCRETISATION_TYPES, "split":100})]
# impedance_options = [("impedance", impedance_params, get_impedance, class_count, {"types": DISCRETISATION_TYPES, "split":20})]
all_options = discharge_options + impedance_options + charge_options
return all_options
def writeln(file, text):
file.write(text + "\n")
def init_folder():
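    """Recreate the results folder and return the paths of empty result.txt and failed.txt log files."""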
if os.path.exists(RESULTS_FOLDER):
shutil.rmtree(RESULTS_FOLDER)
if not os.path.exists(RESULTS_FOLDER):
os.makedirs(RESULTS_FOLDER)
results_file = os.path.join(RESULTS_FOLDER, "result.txt")
failed_file = os.path.join(RESULTS_FOLDER, "failed.txt")
open(results_file, 'w').close()
open(failed_file, 'w').close()
return results_file, failed_file
def execute_auto_ML():
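    """Run auto_ML for every option, appending successes to result.txt and failures
    (with their traceback) to failed.txt, then print the total execution time."""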
start_time = time.time()
print(start_time)
all_options = get_options(min_length=0)
results_file, failed_file = init_folder()
# results = list(map(auto_ML, all_options))
for option in all_options:
try:
result = auto_ML(option)
with open(results_file, 'a') as the_file:
writeln(the_file, SEPARATOR)
for line in result:
writeln(the_file, str(line))
writeln(the_file, SEPARATOR)
        except Exception:
with open(failed_file, 'a') as the_file:
writeln(the_file, SEPARATOR)
writeln(the_file, str(option))
writeln(the_file, str(traceback.format_exc()))
writeln(the_file, SEPARATOR)
seconds = time.time() - start_time
print("execution time", int(seconds / 60 / 60), "h", int(seconds / 60) % 60, "m", int(seconds) % 60, "s")
def test_model_with_test_battery(model_filepath):
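    """Evaluate a saved model on the held-out test batteries.

    The experiment settings (extraction type, params, class count, discretisation)
    are parsed back from the model file name built in auto_ML.
    """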
batteries = TEST_BATTERIES
clean_model_filepath = model_filepath.split(os.sep)[-1].split(".")[0].split(MODEL_FILE_SEP)
params = clean_model_filepath[1].split("-")
class_count = int(clean_model_filepath[2])
discretisation = {
"types": clean_model_filepath[3].split("-"),
"split": int(clean_model_filepath[4]),
}
if clean_model_filepath[0] == "charge":
extract_function = get_charges
if clean_model_filepath[0] == "discharge":
extract_function = get_discharges
if clean_model_filepath[0] == "impedance":
extract_function = get_impedance
X_cycles, y_true = load_batteries(extract_function, params, batteries, class_count, discretisation)
    try:
        loaded_automl = joblib.load(model_filepath)
    except Exception:
        print("Error in model loading -> " + model_filepath)
        raise
    print(loaded_automl)
y_pred = loaded_automl.predict(X_cycles, n_jobs=-1)
result = [
"Batteries: " + str(batteries),
"Accuracy score for battery: " + str(sklearn.metrics.accuracy_score(y_true, y_pred)),
str(precision_recall_fscore_support(y_true, y_pred, average="weighted")),
str(confusion_matrix(y_true, y_pred)),
]
return result
def test_models_with_test_battery(folder_to_test=RESULTS_FOLDER):
# get all model inside the folder
for model_filepath in glob.glob(folder_to_test+os.sep+"*.joblib"):
results = test_model_with_test_battery(model_filepath)
for result in results:
print(result)
if __name__ == "__main__":
# print(test_model_with_test_battery("results/004_fixed_labels_1h_charge/charge___voltage_measured-current_measured-temperature_measured-current_charge-voltage_charge___3.joblib"))
execute_auto_ML()
    # print(discretize(np.array([1,2,3,4,5,6,7,8,9,10,11,12]), {"types": ["mean", "median", "standard_deviation"], "split": 4}))
# test_models_with_test_battery("results/004_fixed_labels_1h_charge/")