import csv
import glob
import logging
import os
import pickle
from pathlib import Path
import pandas as pd
import numpy as np
from streamline.dataprep.data_process import DataProcess
from streamline.modeling.basemodel import BaseModel
from streamline.modeling.utils import ABBREVIATION, SUPPORTED_MODELS, is_supported_model
from streamline.postanalysis.statistics import StatsJob
from streamline.utils.dataset import Dataset
from streamline.utils.job import Job
class ReplicateJob(Job):
"""
This 'Job' script conducts exploratory analysis on the new replication dataset then
applies and evaluates all trained models on one or more previously unseen hold-out
or replication study dataset(s). It also generates new evaluation figure.
It does not deal with model feature importance estimation as this is a part of model training interpretation only.
This script is run once for each replication dataset in rep_data_path.
"""
def __init__(self, dataset_filename, dataset_for_rep, full_path, class_label, instance_label, match_label,
ignore_features=None, algorithms=None, exclude=("XCS", "eLCS"), cv_partitions=3, exclude_plots=None,
categorical_cutoff=10, sig_cutoff=0.05, scale_data=True, impute_data=True,
multi_impute=True, show_plots=False, scoring_metric='balanced_accuracy', random_state=None):
super().__init__()
self.dataset_filename = dataset_filename
self.dataset_for_rep = dataset_for_rep
self.full_path = full_path
self.class_label = class_label
self.instance_label = instance_label
self.match_label = match_label
        if algorithms is None:
            # Copy so that removals below do not mutate the shared SUPPORTED_MODELS list
            self.algorithms = list(SUPPORTED_MODELS)
            if exclude is not None:
                for algorithm in exclude:
                    try:
                        self.algorithms.remove(algorithm)
                    except ValueError:
                        logging.warning("Unknown algorithm in exclude: " + str(algorithm))
        else:
            self.algorithms = list()
            for algorithm in algorithms:
                self.algorithms.append(is_supported_model(algorithm))
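        # self.algorithms now holds the canonical names of every model to apply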
known_exclude_options = ['plot_ROC', 'plot_PRC', 'plot_metric_boxplots', 'feature_correlations']
if exclude_plots is not None:
for x in exclude_plots:
if x not in known_exclude_options:
logging.warning("Unknown exclusion option " + str(x))
else:
exclude_plots = list()
self.plot_roc = 'plot_ROC' not in exclude_plots
self.plot_prc = 'plot_PRC' not in exclude_plots
self.plot_metric_boxplots = 'plot_metric_boxplots' not in exclude_plots
self.exclude_plots = exclude_plots
self.export_feature_correlations = 'feature_correlations' not in exclude_plots
self.show_plots = show_plots
self.cv_partitions = cv_partitions
self.categorical_cutoff = categorical_cutoff
self.sig_cutoff = sig_cutoff
self.scale_data = scale_data
self.impute_data = impute_data
self.scoring_metric = scoring_metric
self.multi_impute = multi_impute
self.ignore_features = ignore_features
self.random_state = random_state
self.train_name = self.full_path.split('/')[-1]
self.experiment_path = '/'.join(self.full_path.split('/')[:-1])
# replication dataset being analyzed in this job
self.apply_name = self.dataset_filename.split('/')[-1].split('.')[0]
def run(self):
# Load Replication Dataset
rep_data = Dataset(self.dataset_filename, self.class_label, self.match_label, self.instance_label)
rep_feature_list = list(rep_data.data.columns.values)
rep_feature_list.remove(self.class_label)
if self.match_label is not None:
rep_feature_list.remove(self.match_label)
if self.instance_label is not None:
rep_feature_list.remove(self.instance_label)
        # Load original training dataset (could include 'match label')
        train_data = Dataset(self.dataset_for_rep, self.class_label, self.match_label, self.instance_label)
all_train_feature_list = list(train_data.data.columns.values)
all_train_feature_list.remove(self.class_label)
if self.match_label is not None:
all_train_feature_list.remove(self.match_label)
if self.instance_label is not None:
all_train_feature_list.remove(self.instance_label)
# Confirm that all features in original training data appear in replication datasets
if not (set(all_train_feature_list).issubset(set(rep_feature_list))):
            raise Exception('Error: One or more features in the training dataset do not appear in the replication dataset!')
# Grab and order replication data columns to match training data columns
rep_data.data = rep_data.data[train_data.data.columns]
        # Create folder hierarchy for this replication dataset's output
        rep_path = self.full_path + "/replication/" + self.apply_name
        for sub_folder in ['exploratory', 'exploratory/initial',
                           'model_evaluation', 'model_evaluation/pickled_metrics']:
            os.makedirs(rep_path + '/' + sub_folder, exist_ok=True)
        # Load the previously identified lists of categorical and quantitative
        # variables from the training phase
        with open(self.full_path + '/exploratory/initial/initial_categorical_features.pickle', 'rb') as infile:
            categorical_variables = pickle.load(infile)
        with open(self.full_path + '/exploratory/initial/initial_quantitative_features.pickle', 'rb') as infile:
            quantitative_variables = pickle.load(infile)
        rep_data.categorical_variables = categorical_variables
        rep_data.quantitative_variables = quantitative_variables
eda = DataProcess(rep_data, self.full_path, ignore_features=self.ignore_features,
categorical_features=categorical_variables, quantitative_features=quantitative_variables,
exclude_eda_output=None,
categorical_cutoff=self.categorical_cutoff, sig_cutoff=self.sig_cutoff,
random_state=self.random_state, show_plots=self.show_plots)
        # Redirect EDA output to this replication dataset's folder
        eda.dataset.name = 'replication/' + self.apply_name
eda.identify_feature_types()
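        # transition_df records instance/feature/missingness counts after each
        # processing step, mirroring the data-process summary from training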
transition_df = pd.DataFrame(columns=['Instances', 'Total Features',
'Categorical Features',
'Quantitative Features', 'Missing Values',
'Missing Percent', 'Class 0', 'Class 1'])
transition_df.loc["Original"] = eda.counts_summary(save=False)
with open(self.experiment_path + '/' + self.train_name +
'/exploratory/binary_categorical_dict.pickle', 'rb') as infile:
binary_categorical_dict = dict(pickle.load(infile))
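        # Values never observed for a binary categorical feature during training
        # cannot be encoded consistently, so they are replaced with nulls below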
for key in binary_categorical_dict:
unique_vals = list(eda.dataset.data[key].unique())
unique_vals = [x for x in unique_vals if not pd.isnull(x)]
if sorted(unique_vals) != sorted(binary_categorical_dict[key]):
new_values = list(set(eda.dataset.data[key].unique()) - set(binary_categorical_dict[key]))
logging.warning("New Value found in Binary Categorical Variable " + str(key)
+ ", replacing with null value")
for feat in new_values:
logging.warning('\t' + str(feat))
                eda.dataset.data[key] = eda.dataset.data[key].replace(new_values, np.nan)
        # Apply the ordinal (text-to-integer) encoding learned during training
try:
with open(self.experiment_path + '/' + self.train_name +
'/exploratory/ordinal_encoding.pickle', 'rb') as infile:
ord_labels = pickle.load(infile)
for feat in ord_labels.index:
temp_y, labels = pd.factorize(eda.dataset.data[feat])
if set(ord_labels.loc[feat]['Category']) == set(labels):
eda.dataset.data[feat] = temp_y
                elif len(ord_labels.loc[feat]['Category']) == 2:
                    new_labels = list(set(labels) - set(ord_labels.loc[feat]['Category']))
                    labels = ord_labels.loc[feat]['Category']
                    # Map known labels to their training codes and all unseen labels to null
                    rename_dict = {label: code for code, label in enumerate(labels)}
                    for lab in new_labels:
                        rename_dict[lab] = None
                    eda.dataset.data.replace({feat: rename_dict}, inplace=True)
                    ord_labels.at[feat, 'Category'] = list(labels) + new_labels
                    ord_labels.at[feat, 'Encoding'] = list(range(len(list(labels)))) + [None] * len(new_labels)
logging.warning("New Value found in Textual Binary Categorical Variable " + str(feat)
+ ", replacing with null value")
for x in new_labels:
logging.warning('\t' + str(x))
                else:
                    new_labels = list(set(labels) - set(ord_labels.loc[feat]['Category']))
                    labels = ord_labels.loc[feat]['Category']
                    # Extend the training encoding, assigning new codes to unseen labels
                    rename_dict = {label: code for code, label in enumerate(list(labels) + new_labels)}
                    eda.dataset.data.replace({feat: rename_dict}, inplace=True)
                    ord_labels.at[feat, 'Category'] = list(labels) + new_labels
                    ord_labels.at[feat, 'Encoding'] = list(range(len(list(labels) + new_labels)))
with open(self.full_path + "/replication/" + self.apply_name +
'/exploratory/apply_ordinal_encoding.pickle', 'wb') as outfile:
pickle.dump(ord_labels, outfile)
ord_labels.to_csv(self.full_path + "/replication/" + self.apply_name +
'/exploratory/Numerical_Encoding_Map.csv')
except FileNotFoundError:
pass
# ExploratoryAnalysis - basic data cleaning
eda.drop_ignored_rowcols()
transition_df.loc["C1"] = eda.counts_summary(save=False)
eda.dataset.initial_eda(self.experiment_path + '/' + self.train_name)
# Missingness Feature Reconstruction
# Read all engineered feature names
try:
with open(self.experiment_path + '/' + self.train_name +
'/exploratory/engineered_features.pickle', 'rb') as infile:
eda.engineered_features = pickle.load(infile)
except FileNotFoundError:
eda.engineered_features = list()
# Recreate missingness features in replication phase
for feat in eda.engineered_features:
eda.dataset.data['Miss_' + feat] = eda.dataset.data[feat].isnull().astype(int)
eda.categorical_features.append('Miss_' + feat)
eda.engineered_features = ['Miss_' + feat for feat in eda.engineered_features]
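        # eda.engineered_features now lists the 'Miss_' indicator columns, matching
        # the names under which they were stored during training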
#transition_df.loc["E1"] = eda.counts_summary(save=False)
try:
# Removing dropped features
with open(self.experiment_path + '/' + self.train_name +
'/exploratory/removed_features.pickle', 'rb') as infile:
removed_features = list(pickle.load(infile))
for feat in removed_features:
if feat in eda.categorical_features:
eda.categorical_features.remove(feat)
if feat in eda.quantitative_features:
eda.quantitative_features.remove(feat)
eda.dataset.data.drop(removed_features, axis=1, inplace=True)
except FileNotFoundError:
pass
#transition_df.loc["C2"] = eda.counts_summary(save=False)
        # Load the processed training feature list; this file is required, so any
        # failure to load it is fatal
        with open(self.experiment_path + '/' + self.train_name +
                  '/exploratory/post_processed_features.pickle', 'rb') as infile:
            post_processed_vars = pickle.load(infile)
non_binary_categorical = list()
for feat in eda.categorical_features:
if feat in eda.dataset.data.columns:
if eda.dataset.data[feat].nunique() > 2:
non_binary_categorical.append(feat)
# logging.warning(non_binary_categorical)
        eda.one_hot_features = list()  # initialized even when no multi-class categoricals exist
        if len(non_binary_categorical) > 0:
            one_hot_df = pd.get_dummies(eda.dataset.data[non_binary_categorical], columns=non_binary_categorical)
            eda.one_hot_features = list(one_hot_df.columns)
            eda.dataset.data.drop(non_binary_categorical, axis=1, inplace=True)
            eda.dataset.data = pd.concat([eda.dataset.data, one_hot_df], axis=1)
        # Add one-hot columns seen in training but absent from the replication data
for feat in post_processed_vars:
if feat not in list(eda.dataset.data.columns):
eda.dataset.data[feat] = 0
eda.one_hot_features.append(feat)
eda.categorical_features += eda.one_hot_features
try:
with open(self.experiment_path + '/' + self.train_name +
'/exploratory/correlated_features.pickle', 'rb') as infile:
correlated_features = list(pickle.load(infile))
except FileNotFoundError:
correlated_features = list()
        # Remove extra features not present in the processed training data
        for feat in list(eda.dataset.data.columns):
            if feat not in post_processed_vars and feat not in correlated_features:
                eda.drop_ignored_rowcols([feat])
#transition_df.loc["E2"] = eda.counts_summary(save=False)
# Removing highly correlated features
for feat in correlated_features:
if feat in eda.categorical_features:
eda.categorical_features.remove(feat)
if feat in eda.quantitative_features:
eda.quantitative_features.remove(feat)
eda.dataset.data.drop(correlated_features, axis=1, inplace=True)
#transition_df.loc["C4"] = eda.counts_summary(save=False)
eda.categorical_features = list(set(post_processed_vars).intersection(set(eda.categorical_features)))
eda.quantitative_features = list(set(post_processed_vars).intersection(set(eda.quantitative_features)))
        if len(list(set(post_processed_vars) - set(eda.categorical_features + eda.quantitative_features))) > 0:
            logging.warning("Final variables in training do not equal the post-processed sum of "
                            "categorical and quantitative features in the replication phase; something is wrong")
eda.dataset.data = eda.dataset.data[post_processed_vars]
transition_df.loc["R1"] = eda.counts_summary(save=False)
transition_df.to_csv(self.full_path + "/replication/" + self.apply_name + '/exploratory/'
+ 'DataProcessSummary.csv', index=True)
# Pickle list of feature names to be treated as categorical variables
with open(self.full_path + "/replication/" + self.apply_name +
'/exploratory/categorical_features.pickle', 'wb') as outfile:
pickle.dump(eda.categorical_features, outfile)
# Pickle list of processed feature names
with open(self.full_path + "/replication/" + self.apply_name +
'/exploratory/post_processed_features.pickle', 'wb') as outfile:
pickle.dump(list(eda.dataset.data.columns), outfile)
with open(self.full_path + "/replication/" + self.apply_name +
'/exploratory/ProcessedFeatureNames.csv', 'w') as outfile:
writer = csv.writer(outfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
writer.writerow(list(eda.dataset.data.columns))
        # Save a copy of the processed replication dataset (used by the 'useful notebooks'
        # to align prediction probabilities with instance IDs)
        eda.dataset.data.to_csv(self.full_path + "/replication/" + self.apply_name + "/"
                                + self.apply_name + "_Processed.csv", index=False)
# Export basic exploratory analysis files
eda.dataset.describe_data(self.experiment_path + '/' + self.train_name)
total_missing = eda.dataset.missingness_counts(self.experiment_path + '/' + self.train_name)
eda.counts_summary(total_missing, plot=True, replicate=True)
# Create features-only version of dataset for some operations
x_rep_data = eda.dataset.feature_only_data()
# Export feature correlation plot if user specified
if self.export_feature_correlations:
eda.dataset.feature_correlation(self.experiment_path + '/' + self.train_name, x_rep_data, show_plots=False)
del x_rep_data # memory cleanup
# Rep Data Preparation for each Training Partition Model set
# (rep data will potentially be scaled, imputed and feature
        # selected in the same way as was done for each corresponding CV training partition)
master_list = [] # Will hold all evalDict's, one for each cv dataset.
cv_dataset_paths = list(glob.glob(self.full_path + "/CVDatasets/*_CV_*Train.csv"))
cv_dataset_paths = [str(Path(cv_dataset_path)) for cv_dataset_path in cv_dataset_paths]
cv_partitions = len(cv_dataset_paths)
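        # Prepare and evaluate one replication copy per CV partition, using that
        # partition's own imputer, scaler, and selected feature subset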
for cv_count in range(0, cv_partitions):
# Get corresponding training CV dataset
cv_train_path = self.full_path + "/CVDatasets/" + self.train_name + '_CV_' + str(cv_count) + '_Train.csv'
cv_train_data = pd.read_csv(cv_train_path, na_values='NA', sep=",")
# Get List of features in cv dataset
# (if feature selection took place this may only include a subset of original training data features)
train_feature_list = list(cv_train_data.columns.values)
train_feature_list.remove(self.class_label)
if self.instance_label is not None:
if self.instance_label in train_feature_list:
train_feature_list.remove(self.instance_label)
if self.match_label is not None:
train_feature_list.remove(self.match_label)
# Working copy of original dataframe -
# a new version will be created for each CV partition to be applied to each corresponding set of models
cv_rep_data = rep_data.data.copy()
            # Build the list of processed feature names the trained models expect
            # (class, instance, and match labels removed below)
feature_name_list = list(post_processed_vars)
feature_name_list.remove(eda.dataset.class_label)
if eda.dataset.instance_label:
feature_name_list.remove(eda.dataset.instance_label)
if eda.dataset.match_label:
feature_name_list.remove(eda.dataset.match_label)
            # Impute dataframe based on training imputation
            if self.impute_data:
                try:
                    # assumes imputation was actually run in training (i.e. user had impute_data set to True)
                    cv_rep_data = self.impute_rep_data(cv_count, cv_rep_data, feature_name_list,
                                                       eda.categorical_features, eda.quantitative_features)
                except Exception as e:
                    logging.warning("Unknown exception during imputation of replication dataset: "
                                    + str(self.apply_name))
                    logging.warning(e)
# raise e
# Scale dataframe based on training scaling
if self.scale_data:
                try:
                    # assumes scaling was actually run in training (i.e. user had scale_data set to True)
                    cv_rep_data = self.scale_rep_data(cv_count, cv_rep_data, feature_name_list)
                except Exception as e:
                    # If no scaler was fit during training, no scaling files were created;
                    # bypass loading of scaling data and leave the replication data unscaled.
                    # logging.warning(e)
                    logging.warning("Notice: Scaling was not conducted for the following target dataset, "
                                    "so scaling was not conducted for replication data: "
                                    + str(self.apply_name))
# raise e
# Conduct feature selection based on training selection
# (Filters out any features not in the final cv training dataset)
cv_rep_data = cv_rep_data[cv_train_data.columns]
del cv_train_data # memory cleanup
# Prep data for evaluation
if self.instance_label is not None:
cv_rep_data = cv_rep_data.drop(self.instance_label, axis=1)
x_test = cv_rep_data.drop(self.class_label, axis=1).values
y_test = cv_rep_data[self.class_label].values
# Unpickle algorithm info from training phases of pipeline
eval_dict = dict()
for algorithm in self.algorithms:
ret = self.eval_model(algorithm, cv_count, x_test, y_test)
eval_dict[algorithm] = ret
                with open(self.full_path + "/replication/" + self.apply_name
                          + '/model_evaluation/pickled_metrics/' + ABBREVIATION[algorithm]
                          + '_CV_' + str(cv_count) + "_metrics.pickle", 'wb') as outfile:
                    pickle.dump(ret, outfile)
# includes everything from training except feature importance values
master_list.append(eval_dict) # update master list with evalDict for this CV model
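        # Reuse the training-phase statistics machinery to summarize replication results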
stats = StatsJob(self.full_path + '/replication/' + self.apply_name,
self.algorithms, self.class_label, self.instance_label, self.scoring_metric,
cv_partitions=self.cv_partitions, top_features=40, sig_cutoff=self.sig_cutoff,
metric_weight='balanced_accuracy', scale_data=self.scale_data,
exclude_plots=self.exclude_plots, show_plots=self.show_plots)
result_table, metric_dict = stats.primary_stats(master_list, rep_data.data)
stats.do_plot_roc(result_table)
stats.do_plot_prc(result_table, rep_data.data, True)
metrics = list(metric_dict[self.algorithms[0]].keys())
stats.save_metric_stats(metrics, metric_dict)
if self.plot_metric_boxplots:
stats.metric_boxplots(metrics, metric_dict)
# Save Kruskal Wallis, Mann Whitney, and Wilcoxon Rank Sum Stats
if len(self.algorithms) > 1:
kruskal_summary = stats.kruskal_wallis(metrics, metric_dict)
stats.mann_whitney_u(metrics, metric_dict, kruskal_summary)
stats.wilcoxon_rank(metrics, metric_dict, kruskal_summary)
# Print phase completion
logging.info(self.apply_name + " phase 9 complete")
        with open(self.experiment_path + '/jobsCompleted/job_apply_' + self.apply_name + '.txt', 'w') as job_file:
            job_file.write('complete')
def impute_rep_data(self, cv_count, cv_rep_data, all_train_feature_list, cat_features, quant_features):
        # Impute categorical features (i.e. those included in the mode_dict)
        try:
            # Pickle file holding the categorical mode values fit on this CV training partition
            impute_cat_info = self.full_path + '/scale_impute/categorical_imputer_cv' + str(cv_count) + '.pickle'
            with open(impute_cat_info, 'rb') as infile:
                mode_dict = pickle.load(infile)
for c in cv_rep_data.columns:
if c in mode_dict: # was the given feature identified as and treated as categorical during training?
cv_rep_data[c].fillna(mode_dict[c], inplace=True)
        except Exception:
            # If there was no missing categorical data in the respective training partition,
            # no imputation files were created; bypass loading and do simple imputation
if cv_rep_data.isna().sum().sum() > 0:
logging.warning("Notice: Categorical Imputation was not conducted for the following target dataset "
"so categorical values were imputed using the median:"
+ str(self.apply_name))
for feat in cat_features:
if cv_rep_data[feat].isnull().sum() > 0:
cv_rep_data[feat].fillna(cv_rep_data[feat].median(), inplace=True)
impute_rep_df = None
        try:
            # Pickle file holding the quantitative imputer fit on this CV training partition
            impute_ordinal_info = self.full_path + '/scale_impute/ordinal_imputer_cv' + str(cv_count) + '.pickle'
            if self.multi_impute:  # multiple imputation of quantitative features
                with open(impute_ordinal_info, 'rb') as infile:
                    imputer = pickle.load(infile)
inst_rep = None
# Prepare data for scikit imputation
if self.instance_label is None or self.instance_label == 'None':
x_rep = cv_rep_data.drop([self.class_label], axis=1).values
else:
x_rep = cv_rep_data.drop([self.class_label, self.instance_label], axis=1).values
inst_rep = cv_rep_data[self.instance_label].values # pull out instance labels in case they include text
y_rep = cv_rep_data[self.class_label].values
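                # Apply the quantitative imputer fit on this CV training partition, so
                # replication values are filled from training-derived statistics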
x_rep_impute = imputer.transform(x_rep)
# Recombine x and y
if self.instance_label is None or self.instance_label == 'None':
impute_rep_df = pd.concat([pd.DataFrame(y_rep, columns=[self.class_label]),
pd.DataFrame(x_rep_impute, columns=all_train_feature_list)], axis=1,
sort=False)
else:
impute_rep_df = pd.concat(
[pd.DataFrame(y_rep, columns=[self.class_label]),
pd.DataFrame(inst_rep, columns=[self.instance_label]),
pd.DataFrame(x_rep_impute, columns=all_train_feature_list)], axis=1, sort=False)
            else:  # simple (median) imputation of quantitative features
                with open(impute_ordinal_info, 'rb') as infile:
                    median_dict = pickle.load(infile)
                for c in cv_rep_data.columns:
                    if c in median_dict:  # was the feature treated as quantitative during training?
                        cv_rep_data[c].fillna(median_dict[c], inplace=True)
                # Explicitly return the median-imputed frame (otherwise this path would return None)
                impute_rep_df = cv_rep_data
        except FileNotFoundError:
            # If there was no missing quantitative data in the respective training partition,
            # no imputation files were created; bypass loading and do simple (mean) imputation
if cv_rep_data.isna().sum().sum() > 0:
logging.warning("Notice: Quantitative Imputation was not conducted for the following target dataset "
"so quantitative values were imputed with the mean: "
+ str(self.apply_name))
for feat in quant_features:
if cv_rep_data[feat].isnull().sum() > 0:
cv_rep_data[feat].fillna(cv_rep_data[feat].mean(), inplace=True)
impute_rep_df = cv_rep_data
return impute_rep_df
def scale_rep_data(self, cv_count, cv_rep_data, all_train_feature_list):
        # Pickle file holding the scaler fit on this CV training partition
        scale_info = self.full_path + '/scale_impute/scaler_cv' + str(cv_count) + '.pickle'
        with open(scale_info, 'rb') as infile:
            scaler = pickle.load(infile)
        decimal_places = 7  # rounding applied to scaled feature values
inst_rep = None
# Scale target replication data
if self.instance_label is None or self.instance_label == 'None':
x_rep = cv_rep_data.drop([self.class_label], axis=1)
else:
x_rep = cv_rep_data.drop([self.class_label, self.instance_label], axis=1)
inst_rep = cv_rep_data[self.instance_label] # pull out instance labels in case they include text
y_rep = cv_rep_data[self.class_label]
# Scale features (x)
x_rep_scaled = pd.DataFrame(scaler.transform(x_rep).round(decimal_places), columns=x_rep.columns)
# Recombine x and y
if self.instance_label is None or self.instance_label == 'None':
scale_rep_df = pd.concat([pd.DataFrame(y_rep, columns=[self.class_label]),
pd.DataFrame(x_rep_scaled, columns=all_train_feature_list)], axis=1, sort=False)
else:
scale_rep_df = pd.concat(
[pd.DataFrame(y_rep, columns=[self.class_label]), pd.DataFrame(inst_rep, columns=[self.instance_label]),
pd.DataFrame(x_rep_scaled, columns=all_train_feature_list)], axis=1, sort=False)
return scale_rep_df
def eval_model(self, algorithm, cv_count, x_test, y_test):
        # Pickle file holding the trained model for this algorithm and CV partition
        model_info = self.full_path + '/models/pickledModels/' + ABBREVIATION[algorithm] + '_' \
                     + str(cv_count) + '.pickle'
        with open(model_info, 'rb') as infile:
            model = pickle.load(infile)
# Prediction evaluation
m = BaseModel(None, algorithm, scoring_metric=self.scoring_metric)
m.model = model
m.model_name = algorithm
m.small_name = ABBREVIATION[algorithm]
metric_list, fpr, tpr, roc_auc, prec, recall, \
prec_rec_auc, ave_prec, probas_ = m.model_evaluation(x_test, y_test)
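        # The None placeholder keeps the same result-list shape as training output,
        # where that slot holds feature importance values (not estimated here)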
return [metric_list, fpr, tpr, roc_auc, prec, recall, prec_rec_auc, ave_prec, None, probas_]