Source code for pyridge.experiment.check

from pyridge.util import metric_dict, prepare_data
from pyridge.util.cross import get_best_param, product_dict
from pyridge import algorithm_dict
import numpy as np
import logging
import os
from time import perf_counter


logger = logging.getLogger('pyridge')


def check_algorithm(folder,
                    dataset,
                    algorithm,
                    metric_list,
                    hyperparameter,
                    metric_cross=None,
                    classification=True,
                    autoencoder=False):
    """
    Test a predictor easily across all 10 folds of a dataset.

    :param str folder: folder that contains the dataset subfolder.
    :param str dataset: name of the dataset.
    :param str algorithm: name of the predictor in ``algorithm_dict``.
    :param list metric_list: metrics to evaluate on the test sets.
    :param dict hyperparameter: grid of hyperparameters to cross-validate.
    :param str metric_cross: metric used to choose the best hyperparameters;
        defaults to 'accuracy' for classification, 'rmse' otherwise.
    :param bool classification: True if we want a classification;
        False if we are looking for a regression.
    :param bool autoencoder: True if we want an autoencoder test;
        False if we are looking for a classic supervised test.
    :return: a dictionary with the averaged metrics and timings.
    """
    logger.info('Starting algorithm %s', algorithm)
    if autoencoder is True:
        classification = False
    if metric_cross is None:
        if 'accuracy' in metric_list and classification is True:
            metric_cross = 'accuracy'
        else:
            metric_cross = 'rmse'
    # Data
    folder_dataset = os.path.join(folder, dataset)
    datasets = [('train_' + dataset + '.' + str(i),
                 'test_' + dataset + '.' + str(i))
                for i in range(10)]
    value_dict = {m: np.empty(10) for m in metric_list}
    # Time
    value_dict['cross_time'] = np.empty(10)
    value_dict['train_time'] = np.empty(10)
    value_dict['test_time'] = np.empty(10)
    value_dict['sum_time'] = np.empty(10)

    fold = 0
    start = perf_counter()
    for train_dataset, test_dataset in datasets:
        value_dict_fold = check_fold(folder_dataset=folder_dataset,
                                     train_dataset=train_dataset,
                                     test_dataset=test_dataset,
                                     algorithm=algorithm,
                                     metric_list=metric_list,
                                     metric_cross=metric_cross,
                                     hyperparameter=hyperparameter,
                                     classification=classification,
                                     autoencoder=autoencoder)
        for m, v in value_dict_fold.items():
            value_dict[m][fold] = v
        fold += 1
    end = perf_counter()

    sum_time = value_dict['sum_time'].copy()
    for m in value_dict.keys():
        value_dict[m] = np.mean(value_dict[m])
        if 'time' not in m:
            logger.info('{} for algorithm {} and '
                        'dataset {} is {}'.format(m, algorithm,
                                                  dataset, value_dict[m]))
    value_dict['time_std'] = np.std(sum_time)
    value_dict['time'] = end - start
    logger.info('Elapsed time is %f', value_dict['time'])
    return value_dict
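
# Usage sketch (illustrative; not part of the original module): a full 10-fold
# evaluation of an ELM classifier. The folder layout 'data/<dataset>/' with
# files 'train_<dataset>.<k>' / 'test_<dataset>.<k>' follows the convention
# used above; the hyperparameter names in the grid below are assumptions and
# depend on the chosen predictor.
def _example_check_algorithm():
    hyperparameter = {'hidden_neurons': [10, 100],               # assumed parameter name
                      'reg': [10.0 ** i for i in range(-2, 3)]}  # assumed parameter name
    result = check_algorithm(folder='data',
                             dataset='iris',
                             algorithm='ELM',
                             metric_list=['accuracy', 'rmse'],
                             hyperparameter=hyperparameter,
                             classification=True)
    logger.info('Mean accuracy over the 10 folds: %f', result['accuracy'])
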
def check_fold(folder_dataset='data/iris',
               train_dataset='train_iris.0',
               test_dataset='test_iris.0',
               sep=r'\s+',
               algorithm='ELM',
               metric_list=['accuracy', 'rmse'],
               metric_cross='accuracy',
               hyperparameter=None,
               classification=True,
               autoencoder=False):
    """
    Run a single train/test fold: cross-validate (if a hyperparameter grid
    is given), train and evaluate a predictor.

    :param str folder_dataset: folder that contains the fold files.
    :param str train_dataset: name of the train file.
    :param str test_dataset: name of the test file.
    :param str sep: column separator used to read the files.
    :param str algorithm: name of the predictor in ``algorithm_dict``.
    :param list metric_list: metrics to evaluate on the test set.
    :param str metric_cross: metric used to choose the best hyperparameters.
    :param dict hyperparameter: grid of hyperparameters to cross-validate;
        if None, the predictor is fitted with its defaults.
    :param bool classification: True if we want a classification;
        False if we are looking for a regression.
    :param bool autoencoder: True if we want an autoencoder test;
        False if we are looking for a classic supervised test.
    :return: a dictionary with the metrics and timings of the fold.
    """
    # Data
    train_data, train_target, scaler = \
        prepare_data(folder=folder_dataset,
                     dataset=train_dataset,
                     sep=sep)
    test_data, test_target, _ = prepare_data(folder=folder_dataset,
                                             dataset=test_dataset,
                                             sep=sep,
                                             scaler=scaler)
    if autoencoder is True:
        # Autoencoder: the target of each partition is the data itself.
        train_target = train_data
        test_target = test_data

    # Algorithm
    start = perf_counter()
    clf = algorithm_dict[algorithm](classification)
    if hyperparameter is None:
        best_param = None
        end_cross = start
        clf.fit(train_data=train_data, train_target=train_target)
    else:
        best_param = get_best_param(predictor=clf,
                                    train_data=train_data,
                                    train_target=train_target,
                                    hyperparameter=hyperparameter,
                                    metric=metric_cross)
        end_cross = perf_counter()
        clf = algorithm_dict[algorithm](classification)
        clf.fit(train_data=train_data,
                train_target=train_target,
                parameter=best_param)
    end_train = perf_counter()

    # Metric
    test_time = 0.0
    value_dict = dict()
    for m in metric_list:
        metric_fun = metric_dict[m]
        start_test = perf_counter()
        v = metric_fun(clf=clf,
                       pred_data=test_data,
                       real_targ=test_target)
        test_time += perf_counter() - start_test
        value_dict.update({m: v})
        logger.debug('{} in dataset {} '
                     'is {}'.format(m, test_dataset, v))
    value_dict['test_time'] = test_time
    cross_time = end_cross - start
    value_dict['cross_time'] = cross_time
    train_time = end_train - end_cross
    value_dict['train_time'] = train_time
    value_dict['sum_time'] = cross_time + train_time + test_time
    return value_dict
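
# Usage sketch (illustrative; not part of the original module): evaluating a
# single fold directly. Paths match the defaults above; the hyperparameter
# grid is an assumption and its keys depend on the chosen predictor.
def _example_check_fold():
    value_dict = check_fold(folder_dataset='data/iris',
                            train_dataset='train_iris.0',
                            test_dataset='test_iris.0',
                            algorithm='ELM',
                            metric_list=['accuracy', 'rmse'],
                            metric_cross='accuracy',
                            hyperparameter={'hidden_neurons': [50, 100]})  # assumed parameter name
    logger.info('Metrics and timings for fold 0: %s', value_dict)
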
def check_hyperparameter_sensitivity(folder,
                                     dataset,
                                     algorithm,
                                     metric,
                                     hyp_range_dict,
                                     fixed_hyp,
                                     repetitions: int = 5,
                                     classification: bool = True):
    """
    Check the sensitivity of a predictor to one or several hyperparameters.

    :param str folder: folder that contains the dataset subfolder.
    :param str dataset: name of the dataset.
    :param str algorithm: name of the predictor in ``algorithm_dict``.
    :param str metric: metric used to evaluate each combination.
    :param dict hyp_range_dict: ranges of the hyperparameters to sweep.
    :param dict fixed_hyp: hyperparameters kept fixed during the sweep.
    :param int repetitions: number of repetitions per fold.
    :param bool classification: True if we want a classification;
        False if we are looking for a regression.
    :return: a dictionary with the values of the swept hyperparameters
        and the chosen metric for each combination.
    """
    # Load dataset
    folder_dataset = os.path.join(folder, dataset)
    k_datasets = [('train_' + dataset + '.' + str(i),
                   'test_' + dataset + '.' + str(i))
                  for i in range(10)]
    final_length = 1
    for parameter, value_list in hyp_range_dict.items():
        final_length = final_length * len(value_list)
    logger.debug('Hyperparameter sensitivity test is going '
                 'to check {} combinations'.format(final_length))

    # Prepare result dict
    result_dict = {metric: np.empty(final_length)}
    for parameter in list(hyp_range_dict.keys()):
        result_dict.update({parameter: np.empty(final_length)})

    count = 0
    logger.debug('Fixed parameters: %s', fixed_hyp)
    for hyp_combination in product_dict(**hyp_range_dict):
        logger.debug('Trying with parameter: %s, combination %i',
                     hyp_combination, count)
        # Load algorithm
        fixed_hyp.update(hyp_combination)
        predictor = algorithm_dict[algorithm](classification)

        # K fold over repetitions
        metric_value = np.empty(10 * repetitions)
        i = 0
        for train_k_dataset, test_k_dataset in k_datasets:
            logger.debug('Fold %s', train_k_dataset.split('.')[-1])
            # Load data
            train_k_data, train_k_target, scaler = \
                prepare_data(folder=folder_dataset,
                             dataset=train_k_dataset)
            test_k_data, test_k_target, _ = \
                prepare_data(folder=folder_dataset,
                             dataset=test_k_dataset,
                             scaler=scaler)
            for r in range(repetitions):
                logger.debug('Repetition %i', r)
                # TRAIN
                predictor.fit(train_data=train_k_data,
                              train_target=train_k_target,
                              parameter=fixed_hyp)
                # TEST
                metric_value[i] = metric_dict[metric](clf=predictor,
                                                      pred_data=test_k_data,
                                                      real_targ=test_k_target)
                i += 1
        # Updating dict
        result_dict[metric][count] = metric_value.mean()
        for parameter, value in hyp_combination.items():
            result_dict[parameter][count] = value
        count += 1
    return result_dict


def create_logger(level='INFO', name='pyridge'):
    """
    Return a logger with time format.

    :param str level: DEBUG, INFO, WARNING, ERROR, CRITICAL.
    :param str name: name of the logger.
    :return: the configured logger instance.
    """
    # create logger
    logger = logging.getLogger(name)
    logger.setLevel(level)
    # create console handler
    ch = logging.StreamHandler()
    # create formatter
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    # add formatter to ch
    ch.setFormatter(formatter)
    # add ch to logger
    logger.addHandler(ch)
    return logger
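
# Usage sketch (illustrative; not part of the original module): configure the
# logger and sweep a single hyperparameter while keeping the others fixed.
# The names 'reg' and 'hidden_neurons' are assumptions and depend on the
# chosen predictor.
def _example_sensitivity():
    create_logger(level='DEBUG')
    result = check_hyperparameter_sensitivity(
        folder='data',
        dataset='iris',
        algorithm='ELM',
        metric='accuracy',
        hyp_range_dict={'reg': [10.0 ** i for i in range(-3, 4)]},  # assumed name
        fixed_hyp={'hidden_neurons': 100},                          # assumed name
        repetitions=5,
        classification=True)
    logger.info('Sensitivity results: %s', result)
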