# Source code for streamline.postanalysis.dataset_compare

import os
import time
import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import kruskal, wilcoxon, mannwhitneyu
from streamline.utils.job import Job
from streamline.modeling.utils import ABBREVIATION, COLORS, is_supported_model
from streamline.modeling.utils import SUPPORTED_MODELS
import seaborn as sns

class CompareJob(Job):
    """
    Job that runs non-parametric statistical comparisons of ML algorithm
    performance between all target datasets found in an experiment folder,
    for each evaluation metric. It also compares the best-performing model
    (highest median) of each dataset, for each metric, and generates
    boxplots of the results. Intended to run once per pipeline analysis.

    Parameters:
        output_path: folder containing the experiment folder (used together
            with ``experiment_name``)
        experiment_name: name of the experiment folder
        experiment_path: full path of the experiment folder; used when
            ``output_path``/``experiment_name`` are not both given
        algorithms: iterable of algorithm identifiers to compare, or None to
            use every entry of ``SUPPORTED_MODELS`` except those in ``exclude``
        exclude: algorithm names removed from the default algorithm list
            (only consulted when ``algorithms`` is None)
        class_label: stored on the instance; not used by this class itself
        instance_label: stored on the instance; not used by this class itself
        sig_cutoff: p-value threshold below which a result is flagged with '*'
        show_plots: if True figures are shown with ``plt.show()``, otherwise
            they are closed after being saved
    """

    def __init__(self, output_path=None, experiment_name=None, experiment_path=None, algorithms=None,
                 exclude=("XCS", "eLCS"),
                 class_label="Class", instance_label=None, sig_cutoff=0.05, show_plots=False):
        super().__init__()
        assert (output_path is not None and experiment_name is not None) or (experiment_path is not None)
        if output_path is not None and experiment_name is not None:
            self.output_path = output_path
            self.experiment_name = experiment_name
            self.experiment_path = self.output_path + '/' + self.experiment_name
        else:
            self.experiment_path = experiment_path
            # Split on the last '/' so that output_path is the complete parent path
            # (consistent with the branch above) rather than a single path component.
            parent, _, name = self.experiment_path.rpartition('/')
            self.experiment_name = name
            self.output_path = parent

        # Everything in the experiment folder that is not one of these bookkeeping
        # entries is treated as a dataset folder.
        ignored = {'.DS_Store', 'metadata.pickle', 'metadata.csv', 'algInfo.pickle',
                   'jobsCompleted', 'logs', 'jobs', 'DatasetComparisons',
                   'UsefulNotebooks', 'dask_logs',
                   # save_runtime() below writes into this folder, so it is not a dataset
                   'runtime',
                   self.experiment_name + '_STREAMLINE_Report.pdf'}
        # Sorting ensures consistent ordering of datasets and of the temporary
        # identifiers (D1, D2, ...) derived from their positions.
        self.datasets = sorted(entry for entry in os.listdir(self.experiment_path)
                               if entry not in ignored)
        self.dataset_directory_paths = [self.experiment_path + "/" + dataset
                                        for dataset in self.datasets]

        self.class_label = class_label
        self.instance_label = instance_label
        self.sig_cutoff = sig_cutoff

        if algorithms is None:
            # Work on a copy: removing from SUPPORTED_MODELS itself would alter the
            # shared module-level list for every other user of it.
            self.algorithms = list(SUPPORTED_MODELS)
            if exclude is not None:
                for algorithm in exclude:
                    try:
                        self.algorithms.remove(algorithm)
                    except ValueError:
                        # Best effort: an exclusion that is not present is reported
                        # instead of being silently dropped.
                        logging.warning("Unknown algorithm in exclude: " + str(algorithm))
        else:
            # NOTE(review): assumes is_supported_model() returns the algorithm name to
            # use (the original appended its return value to the list) - confirm.
            self.algorithms = sorted(is_supported_model(algorithm) for algorithm in algorithms)

        self.show_plots = show_plots
        self.abbrev = dict((k, ABBREVIATION[k]) for k in self.algorithms if k in ABBREVIATION)
        self.colors = dict((k, COLORS[k]) for k in self.algorithms if k in COLORS)
        self.metrics = None

    def run(self):
        """
        Run every dataset comparison and plot, then write the completion marker
        file ``jobsCompleted/job_data_compare.txt``.

        The metric names are taken from the columns (all but the first) of the
        first dataset's ``Summary_performance_mean.csv``.
        """
        self.job_start_time = time.time()  # for tracking phase runtime
        data = pd.read_csv(self.dataset_directory_paths[0] + '/model_evaluation/Summary_performance_mean.csv',
                           sep=',')
        self.metrics = data.columns.values.tolist()[1:]

        # Create directory to store dataset statistical comparisons
        os.makedirs(self.experiment_path + '/DatasetComparisons', exist_ok=True)

        logging.info('Running Statistical Significance Comparisons Between Multiple Datasets...')
        self.kruscall_wallis()
        self.mann_whitney_u()
        self.wilcoxon_rank()
        global_data = self.best_kruscall_wallis()
        self.best_mann_whitney_u(global_data)
        self.best_wilcoxon_rank(global_data)

        logging.info('Generate Boxplots Comparing Dataset Performance...')
        # Boxplots comparing average algorithm performance (for a given metric)
        # across all dataset comparisons
        self.data_compare_bp_all()
        # Boxplots comparing a specific algorithm's CV performance
        # (for ROC AUC or PRC AUC) across all dataset comparisons
        self.data_compare_bp()

        logging.info("Phase 7 complete")
        with open(self.experiment_path + '/jobsCompleted/job_data_compare' + '.txt', 'w') as job_file:
            job_file.write('complete')

    def _load_performance(self, algorithm):
        """
        Read ``<abbrev>_performance.csv`` of ``algorithm`` for every dataset.

        Returns:
            list of DataFrames, one per entry of ``self.dataset_directory_paths``
            and in the same order
        """
        return [pd.read_csv(dataset_path + '/model_evaluation/' + self.abbrev[algorithm] + '_performance.csv')
                for dataset_path in self.dataset_directory_paths]

    def _finish_plot(self, filename):
        """Save the current figure to ``filename``, then show it or close all figures."""
        plt.savefig(filename, bbox_inches="tight")
        if self.show_plots:
            plt.show()
        else:
            plt.close('all')

    def kruscall_wallis(self):
        """
        For each algorithm apply the non-parametric Kruskal-Wallis one-way ANOVA
        on ranks across the datasets, separately for each metric, and write
        ``DatasetComparisons/KruskalWallis_<algorithm>.csv``.

        Each row holds the statistic, p-value, a '*' flag when the p-value is
        below ``sig_cutoff`` and the per-dataset median of the metric.
        """
        label = ['Statistic', 'P-Value', 'Sig(*)']
        label.extend('Median_D' + str(i) for i in range(1, len(self.datasets) + 1))

        for algorithm in self.algorithms:
            # Each performance file is read once per algorithm instead of once per metric.
            frames = self._load_performance(algorithm)
            kruskal_summary = pd.DataFrame(index=self.metrics, columns=label)
            for metric in self.metrics:
                samples = [frame[metric] for frame in frames]
                med_list = [sample.median() for sample in samples]
                try:
                    result = kruskal(*samples)
                except Exception:
                    # Deliberate best effort: a failed test is reported as 'NA' with p = 1.
                    result = ['NA', 1]
                try:
                    kruskal_summary.at[metric, 'Statistic'] = str(round(result[0], 6))
                except TypeError:
                    # round() rejects the 'NA' placeholder
                    kruskal_summary.at[metric, 'Statistic'] = 'NA'
                kruskal_summary.at[metric, 'P-Value'] = str(round(result[1], 6))
                kruskal_summary.at[metric, 'Sig(*)'] = '*' if result[1] < self.sig_cutoff else ''
                for j, median in enumerate(med_list):
                    kruskal_summary.at[metric, 'Median_D' + str(j + 1)] = str(round(median, 6))
            # Export analysis summary to .csv file
            kruskal_summary.to_csv(self.experiment_path + '/DatasetComparisons/KruskalWallis_'
                                   + algorithm + '.csv')

    def wilcoxon_rank(self):
        """
        For each algorithm, apply ``scipy.stats.wilcoxon`` (the Wilcoxon
        signed-rank test, which expects paired samples) to every pair of
        datasets for each metric, and write
        ``DatasetComparisons/WilcoxonRank_<algorithm>.csv``.

        Pairs on which the test raises are recorded with statistic 'NA_error'
        and p-value 1 (see ``temp_summary``).
        """
        label = ['Metric', 'Data1', 'Data2', 'Statistic', 'P-Value', 'Sig(*)',
                 'Median_Data1', 'Median_Data2']
        for algorithm in self.algorithms:
            master_list = self.inter_set_fn(wilcoxon, algorithm)
            # Passing the columns to the constructor also works when there are no
            # dataset pairs (a single dataset), where assigning them afterwards fails.
            df = pd.DataFrame(master_list, columns=label)
            df.to_csv(self.experiment_path + '/DatasetComparisons/WilcoxonRank_' + algorithm + '.csv',
                      index=False)

    def mann_whitney_u(self):
        """
        For each algorithm, apply the non-parametric Mann-Whitney U-test to
        every pair of datasets for each metric, and write
        ``DatasetComparisons/MannWhitney_<algorithm>.csv``.

        Pairs on which the test raises are recorded with statistic 'NA_error'
        and p-value 1 (see ``temp_summary``).
        """
        label = ['Metric', 'Data1', 'Data2', 'Statistic', 'P-Value', 'Sig(*)',
                 'Median_Data1', 'Median_Data2']
        for algorithm in self.algorithms:
            master_list = self.inter_set_fn(mannwhitneyu, algorithm)
            # Columns given up front so an empty result (single dataset) is still valid.
            df = pd.DataFrame(master_list, columns=label)
            df.to_csv(self.experiment_path + '/DatasetComparisons/MannWhitney_' + algorithm + '.csv',
                      index=False)

    def best_kruscall_wallis(self):
        """
        For each metric, pick per dataset the algorithm with the highest median
        and apply the Kruskal-Wallis test to those best algorithms' CV results
        across datasets. Writes
        ``DatasetComparisons/BestCompare_KruskalWallis.csv``.

        Returns:
            list with one entry per metric (in ``self.metrics`` order); each
            entry is ``[best_data, best_list]`` where ``best_data`` holds the
            best algorithm's metric column per dataset and ``best_list`` holds
            ``[algorithm_name, median]`` per dataset
        """
        label = ['Statistic', 'P-Value', 'Sig(*)']
        for i in range(1, len(self.datasets) + 1):
            label.append('Best_Alg_D' + str(i))
            label.append('Median_D' + str(i))
        kruskal_summary = pd.DataFrame(index=self.metrics, columns=label)

        # frames_by_algorithm[a][d] is the performance table of algorithm a on
        # dataset d; every file is read once instead of once per metric.
        frames_by_algorithm = [self._load_performance(algorithm) for algorithm in self.algorithms]

        global_data = []
        for metric in self.metrics:
            best_list = []
            best_data = []
            for d in range(len(self.dataset_directory_paths)):
                alg_data = [frames[d][metric] for frames in frames_by_algorithm]
                alg_med = [values.median() for values in alg_data]
                # Find the best algorithm for the given metric based on the median
                # (first one wins on ties)
                best_med = max(alg_med)
                best_index = alg_med.index(best_med)
                best_data.append(alg_data[best_index])
                best_list.append([self.algorithms[best_index], best_med])
            global_data.append([best_data, best_list])

            try:
                result = kruskal(*best_data)
            except ValueError:
                kruskal_summary.at[metric, 'Statistic'] = str(np.nan)
                kruskal_summary.at[metric, 'P-Value'] = str(np.nan)
                kruskal_summary.at[metric, 'Sig(*)'] = ''
            else:
                kruskal_summary.at[metric, 'Statistic'] = str(round(result[0], 6))
                kruskal_summary.at[metric, 'P-Value'] = str(round(result[1], 6))
                kruskal_summary.at[metric, 'Sig(*)'] = '*' if result[1] < self.sig_cutoff else ''

            for j, (best_alg, best_med) in enumerate(best_list):
                kruskal_summary.at[metric, 'Best_Alg_D' + str(j + 1)] = str(best_alg)
                kruskal_summary.at[metric, 'Median_D' + str(j + 1)] = str(round(best_med, 6))
        # Export analysis summary to .csv file
        kruskal_summary.to_csv(self.experiment_path + '/DatasetComparisons/BestCompare_KruskalWallis.csv')
        return global_data

    def best_mann_whitney_u(self, global_data):
        """
        Apply the Mann-Whitney U-test to every pair of datasets, comparing the
        best-performing algorithm of each dataset for each metric, and write
        ``DatasetComparisons/BestCompare_MannWhitney.csv``.

        Parameters:
            global_data: value returned by ``best_kruscall_wallis``
        """
        df = self.inter_set_best_fn(mannwhitneyu, global_data)
        df.to_csv(self.experiment_path + '/DatasetComparisons/BestCompare_MannWhitney.csv', index=False)

    def best_wilcoxon_rank(self, global_data):
        """
        Apply ``scipy.stats.wilcoxon`` (Wilcoxon signed-rank test) to every pair
        of datasets, comparing the best-performing algorithm of each dataset for
        each metric, and write
        ``DatasetComparisons/BestCompare_WilcoxonRank.csv``.

        Parameters:
            global_data: value returned by ``best_kruscall_wallis``
        """
        df = self.inter_set_best_fn(wilcoxon, global_data)
        df.to_csv(self.experiment_path + '/DatasetComparisons/BestCompare_WilcoxonRank.csv', index=False)

    def data_compare_bp_all(self):
        """
        For each metric, generate a boxplot of the algorithms' mean CV
        performance per dataset, overlaid with one line per algorithm showing
        its trajectory across datasets. Figures are saved as
        ``DatasetComparisons/dataCompBoxplots/DataCompareAllModels_<metric>.png``.
        """
        os.makedirs(self.experiment_path + '/DatasetComparisons/dataCompBoxplots', exist_ok=True)

        # The summary tables do not depend on the metric, so read them once.
        data_name_list = [each.split('/')[-1] for each in self.dataset_directory_paths]
        summaries = [pd.read_csv(each + '/model_evaluation/Summary_performance_mean.csv',
                                 sep=',', index_col=0)
                     for each in self.dataset_directory_paths]
        positions = np.arange(len(self.dataset_directory_paths)) + 1

        # One boxplot generated for each available metric
        for metric in self.metrics:
            df = pd.DataFrame()
            # Mean metric value per dataset for every algorithm, used to draw the
            # algorithm lines on top of the boxplot
            alg_values_dict = {algorithm: [] for algorithm in self.algorithms}
            for data in summaries:
                col = data[metric]  # mean metric value of each algorithm (rows)
                for name, value in zip(data.index.tolist(), col.tolist()):
                    # Rows for algorithms that are not being compared get no line
                    if name in alg_values_dict:
                        alg_values_dict[name].append(value)
                # Columns are datasets, rows are algorithms
                df = pd.concat([df, col], axis=1)
            df.columns = data_name_list

            df.boxplot(column=data_name_list, rot=90)
            # Lines illustrate algorithm performance trajectories between datasets
            for algorithm in self.algorithms:
                plt.plot(positions, alg_values_dict[algorithm],
                         color=self.colors[algorithm], label=algorithm)
            plt.ylabel(str(metric))
            plt.xlabel('Dataset')
            plt.legend(loc="upper left", bbox_to_anchor=(1.01, 1))
            self._finish_plot(self.experiment_path
                              + '/DatasetComparisons/dataCompBoxplots/DataCompareAllModels_'
                              + metric + '.png')

    def data_compare_bp(self):
        """
        For each algorithm, generate boxplots of its CV performance on every
        dataset for the metrics 'ROC AUC' and 'PRC AUC'. Figures are saved as
        ``DatasetComparisons/dataCompBoxplots/DataCompare_<abbrev>_<metric>.png``.
        """
        metric_list = ['ROC AUC', 'PRC AUC']  # Hard coded
        os.makedirs(self.experiment_path + '/DatasetComparisons/dataCompBoxplots', exist_ok=True)
        data_name_list = [each.split('/')[-1] for each in self.dataset_directory_paths]

        for algorithm in self.algorithms:
            # Read each dataset's performance table once per algorithm
            frames = self._load_performance(algorithm)
            for metric in metric_list:
                df = pd.DataFrame()
                for data in frames:
                    df = pd.concat([df, data[metric]], axis=1)
                df.columns = data_name_list

                df.boxplot(column=data_name_list, rot=90)
                plt.ylabel(str(metric))
                plt.xlabel('Dataset')
                plt.title(algorithm)
                self._finish_plot(self.experiment_path
                                  + '/DatasetComparisons/dataCompBoxplots/DataCompare_'
                                  + self.abbrev[algorithm] + '_' + metric + '.png')

    def save_runtime(self):
        """
        Write the seconds elapsed since ``run`` started to
        ``runtime/runtime_compare.txt`` inside the experiment folder.
        ``run`` must have been called first, as it sets ``job_start_time``.
        """
        with open(self.experiment_path + '/runtime/runtime_compare.txt', 'w') as runtime_file:
            runtime_file.write(str(time.time() - self.job_start_time))

    def inter_set_fn(self, fn, algorithm):
        """
        Apply the two-sample test ``fn`` to every pair of datasets for each
        metric, using the CV results of ``algorithm``.

        Parameters:
            fn: callable taking two samples and returning an indexable
                (statistic, p-value) result
            algorithm: algorithm name (a key of ``self.abbrev``)

        Returns:
            list of rows ``[metric, 'D<x>', 'D<y>', statistic, p-value, sig,
            median of first dataset, median of second dataset]``
        """
        # Each dataset's file is read once, not once per metric and pair.
        frames = self._load_performance(algorithm)
        dataset_count = len(frames)
        master_list = []
        for metric in self.metrics:
            for x in range(dataset_count - 1):
                for y in range(x + 1, dataset_count):
                    set1 = frames[x][metric]
                    set2 = frames[y][metric]
                    row = self.temp_summary(set1, set2, x, y, metric, fn)
                    row.append(str(round(set1.median(), 6)))
                    row.append(str(round(set2.median(), 6)))
                    master_list.append(row)
        return master_list

    def inter_set_best_fn(self, fn, global_data):
        """
        Apply the two-sample test ``fn`` to every pair of datasets for each
        metric, comparing each dataset's best algorithm.

        Parameters:
            fn: callable taking two samples and returning an indexable
                (statistic, p-value) result
            global_data: value returned by ``best_kruscall_wallis``

        Returns:
            DataFrame with one row per metric and dataset pair
        """
        label = ['Metric', 'Data1', 'Data2', 'Statistic', 'P-Value', 'Sig(*)']
        for i in range(1, 3):
            label.append('Best_Alg_Data' + str(i))
            label.append('Median_Data' + str(i))

        dataset_count = len(self.datasets)
        master_list = []
        for metric, (best_data, best_list) in zip(self.metrics, global_data):
            for x in range(dataset_count - 1):
                for y in range(x + 1, dataset_count):
                    alg1, med1 = best_list[x]
                    alg2, med2 = best_list[y]
                    row = self.temp_summary(best_data[x], best_data[y], x, y, metric, fn)
                    row.extend([alg1, str(round(med1, 6)), alg2, str(round(med2, 6))])
                    master_list.append(row)
        # Columns are passed to the constructor so that an empty result (fewer
        # than two datasets) still yields a valid, correctly labelled frame.
        return pd.DataFrame(master_list, columns=label)

    def temp_summary(self, set1, set2, x, y, metric, fn):
        """
        Run ``fn`` on one pair of samples and summarise the outcome.

        Parameters:
            set1, set2: samples to compare (must provide ``.equals``)
            x, y: zero-based dataset indices, reported as 'D<x+1>' and 'D<y+1>'
            metric: metric name
            fn: callable taking two samples and returning an indexable
                (statistic, p-value) result

        Returns:
            list ``[metric, 'D<x+1>', 'D<y+1>', statistic, p-value, sig]`` where
            the statistic is 'NA' when both samples are equal and 'NA_error'
            when ``fn`` raised (p-value 1 in both cases), and ``sig`` is '*' if
            the p-value is below ``sig_cutoff``, else ''
        """
        identical = set1.equals(set2)
        if identical:
            # The test is skipped when both samples hold the same values
            result = ['NA', 1]
        else:
            try:
                result = fn(set1, set2)
            except Exception:
                # Deliberate best effort: a failing test is recorded, not raised
                result = ['NA_error', 1]

        statistic = result[0]
        if not identical:
            try:
                statistic = str(round(statistic, 6))
            except Exception:
                # The 'NA_error' placeholder cannot be rounded; keep it as is
                statistic = result[0]

        p_value = result[1]
        return [str(metric),
                'D' + str(x + 1),
                'D' + str(y + 1),
                statistic,
                str(round(p_value, 6)),
                '*' if p_value < self.sig_cutoff else '']