import glob
import logging
import math
import os
import pickle
import csv
from datetime import datetime
from pathlib import Path
from streamline import __version__ as version
import pandas as pd
from fpdf import FPDF
from streamline.modeling.utils import ABBREVIATION, COLORS, is_supported_model, SUPPORTED_MODELS
from streamline.utils.job import Job
class ReportJob(Job):
"""
    This 'Job' script is called by PDF_ReportMain.py and generates a formatted PDF summary report of key
    pipeline results. It is run once for the whole pipeline analysis.
"""
def __init__(self, output_path=None, experiment_name=None, experiment_path=None, algorithms=None,
exclude=("XCS", "eLCS"),
training=True, data_path=None, rep_data_path=None, load_algo=True):
super().__init__()
self.time = None
assert (output_path is not None and experiment_name is not None) or (experiment_path is not None)
if output_path is not None and experiment_name is not None:
self.output_path = output_path
self.experiment_name = experiment_name
self.experiment_path = self.output_path + '/' + self.experiment_name
else:
self.experiment_path = experiment_path
self.experiment_name = self.experiment_path.split('/')[-1]
            self.output_path = '/'.join(self.experiment_path.split('/')[:-1])
self.training = training
self.train_name = None
# Find folders inside directory
if self.training:
self.datasets = os.listdir(self.experiment_path)
remove_list = ['.DS_Store', 'metadata.pickle', 'metadata.csv', 'algInfo.pickle',
'DatasetComparisons', 'jobs', 'jobsCompleted', 'logs',
'KeyFileCopy', 'dask_logs',
experiment_name + '_STREAMLINE_Report.pdf']
for item in remove_list:
if item in self.datasets:
self.datasets.remove(item)
if '.idea' in self.datasets:
self.datasets.remove('.idea')
self.datasets = sorted(self.datasets)
else:
self.train_name = data_path.split('/')[-1].split('.')[0]
self.datasets = []
            for dataset_filename in glob.glob(rep_data_path + '/*'):
                dataset_filename = str(Path(dataset_filename).as_posix())
                # Save unique dataset names so that analysis is run only once if there are both
                # .txt and .csv versions of a dataset with the same name.
                apply_name = dataset_filename.split('/')[-1].split('.')[0]
                if apply_name not in self.datasets:
                    self.datasets.append(apply_name)
self.datasets = sorted(self.datasets)
dataset_directory_paths = []
for dataset in self.datasets:
full_path = self.experiment_path + "/" + dataset
dataset_directory_paths.append(full_path)
self.dataset_directory_paths = dataset_directory_paths
if algorithms is None:
self.algorithms = SUPPORTED_MODELS
if exclude is not None:
for algorithm in exclude:
try:
self.algorithms.remove(algorithm)
                    except ValueError:
                        raise Exception("Unknown algorithm in exclude: " + str(algorithm))
else:
self.algorithms = list()
for algorithm in algorithms:
self.algorithms.append(is_supported_model(algorithm))
# Unpickle metadata from previous phase
        with open(self.experiment_path + '/' + "metadata.pickle", 'rb') as file:
            self.metadata = pickle.load(file)
        with open(self.experiment_path + '/' + "algInfo.pickle", 'rb') as file:
            self.alg_info = pickle.load(file)
# self.metadata = {}
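        # alg_info is assumed to map each algorithm name to a list whose first element
        # flags whether that algorithm was run (inferred from the [key][0] checks below)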
if load_algo:
temp_algo = []
for key in self.alg_info:
if self.alg_info[key][0]:
temp_algo.append(key)
self.algorithms = temp_algo
self.abbrev = dict((k, ABBREVIATION[k]) for k in self.algorithms if k in ABBREVIATION)
self.colors = dict((k, COLORS[k]) for k in self.algorithms if k in COLORS)
self.metrics = None
self.analysis_report = FPDF('P', 'mm', 'A4')
def run(self):
self.job()
def job(self):
self.job_start_time = datetime.now()
self.time = datetime.now()
# Turn metadata dictionary into text list
ars_dic = []
for key in self.metadata:
ars_dic.append(str(key) + ':')
ars_dic.append(str(self.metadata[key]))
ars_dic.append('\n')
# Turn alg_info dictionary into text list
ars_dic_2 = []
for key in sorted(self.alg_info.keys()):
ars_dic_2.append(str(key) + ':')
ars_dic_2.append(str(self.alg_info[key][0]))
ars_dic_2.append('\n')
# Analysis Settings, Global Analysis Settings, ML Modeling Algorithms
self.analysis_report.set_margins(left=10, top=5, right=10, )
self.analysis_report.add_page(orientation='P')
# PDF page dimension reference
# page width = 210 and page height down to start of footer = 285 (these are estimates)
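        # (FPDF was initialized above with unit='mm' and format 'A4', so A4 width is exactly
        # 210 mm and all x/y coordinates below are measured in mm from the top-left corner)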
# FRONT PAGE - Summary of Pipeline settings
# -------------------------------------------------------------------------------------------------------
logging.info("Starting Report")
targetdata = ars_dic[0:27] # Data-path to Specified Quantitative Features
cv = ars_dic[27:33] # cv partitions to partition Method
cat_cut = ars_dic[33:36] # categorical cutoff
stat_cut = ars_dic[36:39] # statistical significance cutoff
process = ars_dic[39:54] # feature missingness cutoff to list of exploratory plots saved
general = ars_dic[54:60] # random seed to run from notebooks
process2 = ars_dic[60:69] # use data scaling to use multivariate imputation
featsel = ars_dic[69:96] # use mutual info to export feature importance plots
overwrite = ars_dic[96:99] # overwrite cv
modeling = ars_dic[99:117] # primary metric to export hyperparameter sweep plots
lcs = ars_dic[117:132]
stats = ars_dic[132:153]
#targetdata = ars_dic[0:21] # Data-path to instance label
#cv = ars_dic[21:27] # cv partitions to partition Method
#match = ars_dic[27:30] # match label
#cat_cut = ars_dic[30:33] # categorical cutoff
#stat_cut = ars_dic[33:36] # statistical significance cutoff
#process = ars_dic[36:51] # feature missingness cutoff to list of exploratory plots saved
#general = ars_dic[51:57] # random seed to run from notebooks
#process2 = ars_dic[57:66] # use data scaling to use multivariate imputation
#featsel = ars_dic[66:93] # use mutual info to export feature importance plots
#overwrite = ars_dic[93:96] # overwrite cv
#modeling = ars_dic[96:114] # primary metric to export hyperparameter sweep plots
#lcs = ars_dic[114:129]
#stats = ars_dic[129:150]
ls2 = ars_dic_2
# Report Title
self.analysis_report.set_font('Times', 'B', 12)
if self.training:
self.analysis_report.cell(w=180, h=8, txt='STREAMLINE Testing Data Evaluation Report: ' + str(self.time), ln=2,
border=1, align='L')
else:
self.analysis_report.cell(w=180, h=8, txt='STREAMLINE Replication Data Evaluation Report: ' + str(self.time),
ln=2, border=1, align='L')
self.analysis_report.y += 2 # Margin below page header
        # Begin Settings
top_of_list = self.analysis_report.y # Page height for start of algorithm settings
self.analysis_report.set_font('Times', 'B', 9)
self.analysis_report.multi_cell(w=69, h=4, txt='General Pipeline Settings:', border=1, align='L')
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 7)
self.analysis_report.multi_cell(w=69, h=4,
txt=' ' + list_to_string(cv) + ' ' + list_to_string(
cat_cut) + ' ' + list_to_string(stat_cut) + ' ' + list_to_string(
general),
border=1, align='L')
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', 'B', 9)
self.analysis_report.multi_cell(w=69, h=4, txt='Feature Importance/Selection Settings:', border=1, align='L')
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 7)
self.analysis_report.multi_cell(w=69, h=4,
txt=' ' + list_to_string(featsel),
border=1, align='L')
self.analysis_report.set_font('Times', 'B', 9)
self.analysis_report.multi_cell(w=69, h=4, txt='ML Modeling Algorithms:', border=1, align='L')
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 7)
self.analysis_report.multi_cell(w=69, h=4, txt=' ' + list_to_string(ls2), border=1, align='L')
self.analysis_report.y += 1
self.analysis_report.set_font('Times', 'B', 9)
self.analysis_report.multi_cell(w=69, h=4, txt='Modeling Settings:', border=1, align='L')
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 7)
self.analysis_report.multi_cell(w=69, h=4, txt=' ' + list_to_string(modeling), border=1, align='L')
self.analysis_report.y += 1
self.analysis_report.set_font('Times', 'B', 9)
self.analysis_report.multi_cell(w=69, h=4, txt='LCS Settings (eLCS,XCS,ExSTraCS):', border=1, align='L')
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 7)
self.analysis_report.multi_cell(w=69, h=4, txt=' ' + list_to_string(lcs), border=1, align='L')
self.analysis_report.y += 1
self.analysis_report.set_font('Times', 'B', 9)
self.analysis_report.multi_cell(w=69, h=4, txt='Stats and Figure Settings:', border=1, align='L')
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 7)
self.analysis_report.multi_cell(w=69, h=4, txt=' ' + list_to_string(stats), border=1, align='L')
self.analysis_report.x += 70
self.analysis_report.y = top_of_list # 96
self.analysis_report.set_font('Times', 'B', 9)
self.analysis_report.multi_cell(w=110, h=4, txt='EDA and Processing Settings:', border=1, align='L')
self.analysis_report.x += 70
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 7)
self.analysis_report.multi_cell(w=110, h=4,
txt=' ' + list_to_string(process) + ' ' + list_to_string(
process2) + ' ' + list_to_string(overwrite),
border=1, align='L')
self.analysis_report.y += 1 # Space below section header
self.analysis_report.x += 70
if self.training:
            # Get names of datasets run in the analysis
list_datasets = ''
i = 1
for each in self.datasets:
list_datasets = list_datasets + ('D' + str(i) + ' = ' + str(each) + '\n')
i += 1
            # Report datasets
self.analysis_report.set_font('Times', 'B', 9)
self.analysis_report.multi_cell(w=110, h=4, txt='Target Dataset(s):', border=1, align='L')
self.analysis_report.x += 70
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 7)
self.analysis_report.multi_cell(w=110, h=4, txt=list_datasets, border=1, align='L')
else:
list_datasets = ''
i = 1
for each in self.datasets:
list_datasets = list_datasets + ('D' + str(i) + ' = ' + str(each) + '\n')
i += 1
self.analysis_report.set_font('Times', 'B', 9)
self.analysis_report.multi_cell(w=110, h=4, txt='Target Training Dataset:', border=1, align='L')
self.analysis_report.x += 70
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 7)
self.analysis_report.multi_cell(w=110, h=4, txt=self.train_name, border=1, align='L')
#self.analysis_report.y += 5
#self.analysis_report.x = 70 #10
self.analysis_report.x += 70
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', 'B', 9)
self.analysis_report.multi_cell(w=110, h=4, txt='Applied to Following Replication Dataset(s):', border=1, align='L')
self.analysis_report.x += 70
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 7)
self.analysis_report.multi_cell(w=110, h=4, txt= list_datasets, border=1, align='L')
#self.analysis_report.multi_cell(w=180, h=4, txt='Applied to Following Replication Dataset(s): ' + '\n' + list_datasets, border=1, align='L')
self.analysis_report.x += 70
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', 'B', 9)
self.analysis_report.multi_cell(w=110, h=4, txt='Target Data Settings:', border=1, align='L')
self.analysis_report.x += 70
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 7)
self.analysis_report.multi_cell(w=110, h=4,
txt=' ' + list_to_string(targetdata),
border=1, align='L')
#self.analysis_report.y += 2 # Margin below Datasets
#self.analysis_report.y += 2 # Margin below Datasets
#self.analysis_report.set_font('Times', 'B', 10)
#self.analysis_report.cell(w=180, h=4, txt='STREAMLINE Run Settings', ln=2, border=1, align='L')
#bottom_of_list = self.analysis_report.y
#self.analysis_report.y = bottom_of_list + 2
"""
try_again = True
try:
self.analysis_report.image('info/Pictures/STREAMLINE_LOGO.png', 102, 150, 90)
try_again = False
except Exception:
pass
if try_again:
try: # Running on Google Colab
self.analysis_report.image('/content/drive/MyDrive/STREAMLINE/info/Pictures/STREAMLINE_LOGO.png', 102, 150,
90)
except Exception:
pass
"""
"""
ls1 = ars_dic[0:87] # DataPath to OverwriteCVDatasets - filter poor [0:87]
# ls2 = ars_dic[87:132] # ML modeling algorithms (NaiveB - ExSTraCS) [87:132]
ls2 = ars_dic_2
ls3 = ars_dic[87:105] # primary metric - Export Hyperparameter SweepPLot [132:150]
ls4 = ars_dic[105:129] # DoLCS Hyperparameter Sweep LCS hyper-sweep timeout) [150:165]
ls5 = ars_dic[129:147] # ExportROCPlot to Top Model Features to Display [165:180]
self.analysis_report.set_font('Times', 'B', 12)
if self.training:
self.analysis_report.cell(w=180, h=8, txt='STREAMLINE Testing Evaluation Report: ' + str(self.time), ln=2,
border=1, align='L')
else:
self.analysis_report.cell(w=180, h=8, txt='STREAMLINE Replication Evaluation Report: ' + str(self.time),
ln=2, border=1, align='L')
self.analysis_report.y += 2 # Margin below page header
top_of_list = self.analysis_report.y # Page height for start of algorithm settings
self.analysis_report.set_font('Times', 'B', 10)
self.analysis_report.multi_cell(w=90, h=4, txt='General Pipeline Settings:', border=1, align='L')
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 8)
self.analysis_report.multi_cell(w=90, h=4,
txt=' ' + list_to_string(ls1) + ' ' + list_to_string(
ls3) + ' ' + list_to_string(
ls5),
border=1, align='L')
bottom_of_list = self.analysis_report.y
self.analysis_report.x += 90
self.analysis_report.y = top_of_list # 96
self.analysis_report.set_font('Times', 'B', 10)
self.analysis_report.multi_cell(w=90, h=4, txt='ML Modeling Algorithms:', border=1, align='L')
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 8)
self.analysis_report.x += 90
self.analysis_report.multi_cell(w=90, h=4, txt=' ' + list_to_string(ls2), border=1, align='L')
self.analysis_report.x += 90
self.analysis_report.y += 2
self.analysis_report.set_font('Times', 'B', 10)
self.analysis_report.multi_cell(w=90, h=4, txt='LCS Settings (eLCS,XCS,ExSTraCS):', border=1, align='L')
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 8)
self.analysis_report.x += 90
self.analysis_report.multi_cell(w=90, h=4, txt=' ' + list_to_string(ls4), border=1, align='L')
self.analysis_report.y = bottom_of_list + 2
try_again = True
try:
self.analysis_report.image('info/Pictures/STREAMLINE_LOGO.png', 102, 150, 90)
try_again = False
except Exception:
pass
if try_again:
try: # Running on Google Colab
self.analysis_report.image('/content/drive/MyDrive/STREAMLINE/info/Pictures/STREAMLINE_LOGO.png', 102, 150,
90)
except Exception:
pass
if self.training:
# Get names of self.datasets run in analysis
list_datasets = ''
i = 1
for each in self.datasets:
list_datasets = list_datasets + ('D' + str(i) + ' = ' + str(each) + '\n')
i += 1
# Report self.datasets
self.analysis_report.set_font('Times', 'B', 10)
self.analysis_report.multi_cell(w=180, h=4, txt='Datasets', border=1, align='L')
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 8)
self.analysis_report.multi_cell(w=180, h=4, txt=list_datasets, border=1, align='L')
else:
self.analysis_report.cell(w=180, h=4, txt='Target Training Dataset: ' + self.train_name, border=1,
align='L')
self.analysis_report.y += 5
self.analysis_report.x = 10
list_datasets = ''
i = 1
for each in self.datasets:
list_datasets = list_datasets + ('D' + str(i) + ' = ' + str(each) + '\n')
i += 1
self.analysis_report.multi_cell(w=180, h=4, txt='Applied self.datasets: ' + '\n' + list_datasets, border=1,
align='L')
"""
self.footer()
# NEXT PAGE(S) - Exploratory Univariate Analysis for each Dataset
# ------------------------------------------------------------------
if self.training:
logging.info("Publishing Univariate Analysis")
result_limit = 5 # Limits to this many dataset results per page
dataset_count = len(self.datasets)
# Determine number of pages needed for univariate results
page_count = dataset_count / float(result_limit)
page_count = math.ceil(page_count) # rounds up to next full integer
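            # e.g., 12 datasets with result_limit=5 gives ceil(12/5) = 3 pages,
            # covering datasets [0:5], [5:10], and [10:12]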
for page in range(0, page_count): # generate each page
self.pub_univariate(page, result_limit, page_count)
# NEXT PAGE(S) Data and Model Prediction Summary
# --------------------------------------------------------------------------------------
M = None
logging.info("Publishing Model Prediction Summary")
for m in range(len(self.datasets)):
M = m
# Create PDF and Set Options
self.analysis_report.set_margins(left=1, top=1, right=1, )
self.analysis_report.add_page()
self.analysis_report.set_font('Times', 'B', 12)
self.analysis_report.cell(w=0, h=8,
txt="Dataset and Model Prediction Summary: D" + str(m + 1) + " = " +
self.datasets[m],
border=1, align="L", ln=2)
self.analysis_report.set_font(family='times', size=8)
# Exploratory Analysis ----------------------------
# Image placement notes:
# upper left hand coordinates (x,y), then image width then height (image fit to space)
# upper left hand coordinates (x,y), then image width with height based on image dimensions
# (retain original image ratio)
# Insert Data Processing Count Summary
self.analysis_report.set_font('Times', 'B', 10)
self.analysis_report.x = 1
self.analysis_report.y = 10
self.analysis_report.cell(119, 4, 'Data Processing/Counts Summary', 1, align="L")
self.analysis_report.x = 1
self.analysis_report.y = 15
self.analysis_report.set_font('Times', '', 7)
self.analysis_report.set_fill_color(200)
if self.training:
data_process_path = self.experiment_path + '/' + self.datasets[
m] + "/exploratory/DataProcessSummary.csv"
else:
data_process_path = self.experiment_path + '/' + self.train_name + '/replication/' + self.datasets[
m] + "/exploratory/DataProcessSummary.csv"
table1 = [] # Initialize an empty list to store the data
with open(data_process_path, "r") as csv_file:
csv_reader = csv.reader(csv_file)
for row in csv_reader:
table1.append(row)
# Format
# data_summary = data_summary.round(3)
th = self.analysis_report.font_size
            col_width_list = [13, 13, 13, 14, 14, 13, 13, 13, 13]  # 119 mm of table width total
# Print table header first
row_count = 0
col_count = 0
previous_row = None
for row in table1: # each row
# Make header
if row_count == 0:
for datum in row: # Print first row
entry_list = str(datum).split(' ')
self.analysis_report.cell(col_width_list[col_count], th, entry_list[0], border=0, align="C")
col_count += 1
self.analysis_report.ln(th) # critical
col_count = 0
for datum in row: # Print second row
entry_list = str(datum).split(' ')
try:
self.analysis_report.cell(col_width_list[col_count], th, entry_list[1], border=0, align="C")
except Exception:
self.analysis_report.cell(col_width_list[col_count], th, ' ', border=0, align="C")
col_count += 1
self.analysis_report.ln(th) # critical
col_count = 0
# Fill in data
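                    # Shading convention (inferred from the fill flags below): the first data
                    # row is fully shaded; afterwards a cell is shaded only when its value
                    # changed from the previous row, highlighting what each step altered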
elif row_count == 1:
previous_row = row
for datum in row:
if col_count == 0:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1, align="L",
fill=True)
elif col_count == 6: # missing percent column
self.analysis_report.cell(col_width_list[col_count], th, str(round(float(datum), 4)),
border=1, align="L", fill=True)
else:
self.analysis_report.cell(col_width_list[col_count], th, str(int(float(datum))), border=1,
align="L", fill=True)
col_count += 1
self.analysis_report.ln(th) # critical
col_count = 0
else:
for datum in row:
if col_count == 0:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1, align="L")
elif str(previous_row[col_count]) == str(row[col_count]): # Value unchanged
if col_count == 6: # missing percent column
self.analysis_report.cell(col_width_list[col_count], th, str(round(float(datum), 4)),
border=1, align="L")
else:
self.analysis_report.cell(col_width_list[col_count], th, str(int(float(datum))),
border=1, align="L")
else:
if col_count == 6: # missing percent column
self.analysis_report.cell(col_width_list[col_count], th, str(round(float(datum), 4)),
border=1, align="L", fill=True)
else:
self.analysis_report.cell(col_width_list[col_count], th, str(int(float(datum))),
border=1, align="L", fill=True)
col_count += 1
self.analysis_report.ln(th) # critical
col_count = 0
previous_row = row
row_count += 1
row_count -= 1
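            # Re-print the last data row, relabeled 'Processed' and fully shaded, to
            # summarize the dataset state after all cleaning/engineering steps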
for datum in table1[row_count]:
if col_count == 0:
self.analysis_report.cell(col_width_list[col_count], th, 'Processed', border=1, align="L",
fill=True)
else:
if col_count == 6: # missing percent column
self.analysis_report.cell(col_width_list[col_count], th, str(round(float(datum), 4)), border=1,
align="L", fill=True)
else:
self.analysis_report.cell(col_width_list[col_count], th, str(int(float(datum))), border=1,
align="L", fill=True)
col_count += 1
if self.training:
self.analysis_report.set_font('Times', 'B', 8)
self.analysis_report.x = 1
self.analysis_report.y = 41
self.analysis_report.cell(90, 4, 'Cleaning (C) and Engineering (E) Elements', 0, align="L")
self.analysis_report.set_font('Times', '', 7)
self.analysis_report.ln(th) # critical
self.analysis_report.cell(90, 4, ' * C1 - Remove instances with no outcome and features to ignore', 0,
align="L")
self.analysis_report.ln(th) # critical
self.analysis_report.cell(90, 4, ' * E1 - Add missingness features',
0, align="L")
self.analysis_report.ln(th) # critical
self.analysis_report.cell(90, 4, ' * C2 - Remove features with invariance or high missingness', 0, align="L")
self.analysis_report.ln(th) # critical
self.analysis_report.cell(90, 4, ' * C3 - Remove instances with high missingness', 0, align="L")
self.analysis_report.ln(th) # critical
self.analysis_report.cell(90, 4, ' * E2 - Add one-hot-encoding of categorical features', 0, align="L")
self.analysis_report.ln(th) # critical
self.analysis_report.cell(90, 4, ' * C4 - Remove highly correlated features', 0, align="L")
else:
self.analysis_report.set_font('Times', 'B', 8)
self.analysis_report.x = 1
self.analysis_report.y = 41
self.analysis_report.cell(90, 4, 'Cleaning (C) and Replication (R) Elements', 0, align="L")
self.analysis_report.set_font('Times', '', 7)
self.analysis_report.ln(th) # critical
self.analysis_report.cell(90, 4, ' * C1 - Remove instances with no outcome', 0,
align="L")
self.analysis_report.ln(th) # critical
self.analysis_report.cell(90, 4, ' * R1 - Add/remove same features as Phase 1',
0, align="L")
# Insert Class Imbalance barplot
self.analysis_report.set_font('Times', 'B', 10)
self.analysis_report.x = 70
self.analysis_report.y = 42
self.analysis_report.cell(45, 4, 'Class Balance (Processed)', 1, align="L")
self.analysis_report.set_font('Times', '', 8)
if self.training:
self.analysis_report.image(
self.experiment_path + '/' + self.datasets[m] + '/exploratory/ClassCountsBarPlot.png', 68, 47, 45,
35)
# upper left hand coordinates (x,y), then image width then height (image fit to space)
else:
self.analysis_report.image(
self.experiment_path + '/' + self.train_name + '/replication/' + self.datasets[
m] + '/exploratory/ClassCountsBarPlot.png', 68, 47, 45, 35)
# upper left hand coordinates (x,y), then image width then height (image fit to space)
# Insert Feature Correlation Plot
try:
self.analysis_report.set_font('Times', 'B', 10)
self.analysis_report.x = 143
self.analysis_report.y = 42
self.analysis_report.cell(50, 4, 'Feature Correlations (Pearson)', 1, align="L")
self.analysis_report.set_font('Times', '', 8)
if self.training:
self.analysis_report.image(
self.experiment_path + '/' + self.datasets[m] + '/exploratory/FeatureCorrelations.png',
120, 47, 89, 70)
# self.experiment_path + '/' + self.datasets[m] + '/exploratory/FeatureCorrelations.png',
# 85, 15, 125, 100)
# upper left hand coordinates (x,y),
                    # then image width with height based on image dimensions (retain original image ratio)
else:
self.analysis_report.image(
self.experiment_path + '/' + self.train_name + '/replication/' + self.datasets[
m] + '/exploratory/FeatureCorrelations.png', 120, 47, 89, 70)
# self.experiment_path + '/' + self.train_name + '/applymodel/' + self.datasets[
# m] + '/exploratory/FeatureCorrelations.png', 85, 15, 125, 100)
# upper left hand coordinates (x,y),
                    # then image width with height based on image dimensions (retain original image ratio)
except Exception:
self.analysis_report.x = 135
self.analysis_report.y = 60
self.analysis_report.cell(35, 4, 'No Feature Correlation Plot', 1, align="L")
pass
""" #REMOVED FOR REFORMATTING
if self.training:
data_summary = pd.read_csv(
self.experiment_path + '/' + self.datasets[m] + "/exploratory/DataCounts.csv")
else:
data_summary = pd.read_csv(
self.experiment_path + '/' + self.train_name + '/applymodel/' + self.datasets[
m] + "/exploratory/DataCounts.csv")
info_ls = []
for i in range(len(data_summary)):
info_ls.append(data_summary.iloc[i, 0] + ': ')
info_ls.append(str(data_summary.iloc[i, 1]))
info_ls.append('\n')
self.analysis_report.x = 1
self.analysis_report.y = 52
self.analysis_report.set_font('Times', 'B', 8)
self.analysis_report.multi_cell(w=60, h=4, txt='Dataset Counts Summary:', border=1, align='L')
self.analysis_report.set_font('Times', '', 8)
self.analysis_report.multi_cell(w=60, h=4, txt=' ' + list_to_string(info_ls), border=1, align='L')
"""
# Report Best Algorithms by metric
if self.training:
summary_performance = pd.read_csv(
self.experiment_path + '/' + self.datasets[m] + "/model_evaluation/Summary_performance_mean.csv")
else:
summary_performance = pd.read_csv(
self.experiment_path + '/' + self.train_name + '/replication/' + self.datasets[
m] + "/model_evaluation/Summary_performance_mean.csv")
summary_performance['ROC AUC'] = summary_performance['ROC AUC'].astype(float)
highest_roc = summary_performance['ROC AUC'].max()
algorithm = summary_performance[summary_performance['ROC AUC'] == highest_roc].index.values
best_alg_roc = summary_performance.iloc[algorithm, 0]
summary_performance['Balanced Accuracy'] = summary_performance['Balanced Accuracy'].astype(float)
highest_ba = summary_performance['Balanced Accuracy'].max()
algorithm = summary_performance[summary_performance['Balanced Accuracy'] == highest_ba].index.values
best_alg_ba = summary_performance.iloc[algorithm, 0]
summary_performance['F1 Score'] = summary_performance['F1 Score'].astype(float)
highest_f1 = summary_performance['F1 Score'].max()
algorithm = summary_performance[summary_performance['F1 Score'] == highest_f1].index.values
best_alg_f1 = summary_performance.iloc[algorithm, 0]
summary_performance['PRC AUC'] = summary_performance['PRC AUC'].astype(float)
highest_prc = summary_performance['PRC AUC'].max()
algorithm = summary_performance[summary_performance['PRC AUC'] == highest_prc].index.values
best_alg_prc = summary_performance.iloc[algorithm, 0]
summary_performance['PRC APS'] = summary_performance['PRC APS'].astype(float)
highest_aps = summary_performance['PRC APS'].max()
algorithm = summary_performance[summary_performance['PRC APS'] == highest_aps].index.values
best_alg_aps = summary_performance.iloc[algorithm, 0]
self.analysis_report.x = 1
self.analysis_report.y = 85
self.analysis_report.set_font('Times', 'B', 8)
self.analysis_report.multi_cell(w=80, h=4, txt='Top ML Algorithm Results (Averaged Over CV Runs):',
border=1,
align='L')
self.analysis_report.set_font('Times', '', 8)
if len(best_alg_roc.values) > 1:
self.analysis_report.multi_cell(w=80, h=4,
txt="Best (ROC_AUC): " + str(
best_alg_roc.values[0]) + ' (TIE) = ' + str(
"{:.3f}".format(highest_roc)), border=1, align='L')
else:
self.analysis_report.multi_cell(w=80, h=4,
txt="Best (ROC_AUC): " + str(best_alg_roc.values[0]) + ' = ' + str(
"{:.3f}".format(highest_roc)), border=1, align='L')
if len(best_alg_ba.values) > 1:
self.analysis_report.multi_cell(w=80, h=4,
txt="Best (Balanced Acc.): " + str(
best_alg_ba.values[0]) + ' (TIE) = ' + str(
"{:.3f}".format(highest_ba)), border=1, align='L')
else:
self.analysis_report.multi_cell(w=80, h=4,
txt="Best (Balanced Acc.): " + str(best_alg_ba.values[0]) + ' = ' + str(
"{:.3f}".format(highest_ba)), border=1, align='L')
if len(best_alg_f1.values) > 1:
self.analysis_report.multi_cell(w=80, h=4,
txt="Best (F1 Score): " + str(
best_alg_f1.values[0]) + ' (TIE) = ' + str(
"{:.3f}".format(highest_f1)), border=1, align='L')
else:
self.analysis_report.multi_cell(w=80, h=4,
txt="Best (F1 Score): " + str(best_alg_f1.values[0]) + ' = ' + str(
"{:.3f}".format(highest_f1)), border=1, align='L')
if len(best_alg_prc.values) > 1:
self.analysis_report.multi_cell(w=80, h=4,
txt="Best (PRC AUC): " + str(
best_alg_prc.values[0]) + ' (TIE) = ' + str(
"{:.3f}".format(highest_prc)), border=1, align='L')
else:
self.analysis_report.multi_cell(w=80, h=4,
txt="Best (PRC AUC): " + str(best_alg_prc.values[0]) + ' = ' + str(
"{:.3f}".format(highest_prc)), border=1, align='L')
if len(best_alg_aps.values) > 1:
self.analysis_report.multi_cell(w=80, h=4,
txt="Best (PRC APS): " + str(
best_alg_aps.values[0]) + ' (TIE) = ' + str(
"{:.3f}".format(highest_aps)), border=1, align='L')
else:
self.analysis_report.multi_cell(w=80, h=4,
txt="Best (PRC APS): " + str(best_alg_aps.values[0]) + ' = ' + str(
"{:.3f}".format(highest_aps)), border=1, align='L')
# self.analysis_report.multi_cell(
# w=80, h=4,
# txt="Best (ROC_AUC): "
# + str(best_alg_roc.values) + ' = '
# + str("{:.3f}".format(highest_roc))
# + '\n' + "Best (Balanced Acc.): "
# + str(best_alg_ba.values)
# + ' = ' + str("{:.3f}".format(highest_ba))
# + '\n' + "Best (F1 Score): "
# + str(best_alg_f1.values) + ' = '
# + str("{:.3f}".format(highest_f1))
# + '\n' + "Best (PRC AUC): "
# + str(best_alg_prc.values) + ' = '
# + str("{:.3f}".format(highest_prc))
# + '\n' + "Best (PRC APS): "
# + str(best_alg_aps.values) + ' = '
# + str("{:.3f}".format(highest_aps)), border=1, align='L')
self.analysis_report.set_font('Times', 'B', 10)
# ROC
# -------------------------------
self.analysis_report.x = 1
self.analysis_report.y = 112
self.analysis_report.cell(10, 4, 'ROC', 1, align="L")
if self.training:
self.analysis_report.image(
self.experiment_path + '/' + self.datasets[m] + '/model_evaluation/Summary_ROC.png', 4, 118,
120)
self.analysis_report.image(
self.experiment_path + '/' + self.datasets[
m] + '/model_evaluation/metricBoxplots/Compare_ROC AUC.png', 124,
118,
82, 85)
else:
self.analysis_report.image(
self.experiment_path + '/' + self.train_name + '/replication/' + self.datasets[
m] + '/model_evaluation/Summary_ROC.png',
4, 118, 120)
self.analysis_report.image(
self.experiment_path + '/' + self.train_name + '/replication/' + self.datasets[
m] + '/model_evaluation/metricBoxplots/Compare_ROC AUC.png', 124, 118, 82, 85)
# PRC-------------------------------
self.analysis_report.x = 1
self.analysis_report.y = 200
self.analysis_report.cell(10, 4, 'PRC', 1, align="L")
if self.training:
self.analysis_report.image(
self.experiment_path + '/' + self.datasets[m] + '/model_evaluation/Summary_PRC.png', 4, 206,
133) # wider to account for more text
self.analysis_report.image(
self.experiment_path + '/' + self.datasets[
m] + '/model_evaluation/metricBoxplots/Compare_PRC AUC.png', 138,
205,
68, 80)
else:
self.analysis_report.image(
self.experiment_path + '/' + self.train_name + '/replication/' + self.datasets[
m] + '/model_evaluation/Summary_PRC.png',
4, 206, 133) # wider to account for more text
self.analysis_report.image(
self.experiment_path + '/' + self.train_name + '/replication/' + self.datasets[
m] + '/model_evaluation/metricBoxplots/Compare_PRC AUC.png', 138, 205, 68, 80)
self.footer()
# NEXT PAGE(S) - Average Model Prediction Statistics
# --------------------------------------------------------------------------------------
logging.info("Publishing Average Model Prediction Statistics")
result_limit = 5 # Limits to this many dataset results per page
dataset_count = len(self.datasets)
        # Determine number of pages needed for model prediction results
page_count = dataset_count / float(result_limit)
page_count = math.ceil(page_count) # rounds up to next full integer
self.analysis_report.set_fill_color(200)
for page in range(0, page_count): # generate each page
self.pub_model_mean_stats(page, result_limit, page_count)
# NEXT PAGE(S) - Median Model Prediction Statistics
# --------------------------------------------------------------------------------------
logging.info("Publishing Median Model Prediction Statistics")
result_limit = 5 # Limits to this many dataset results per page
dataset_count = len(self.datasets)
        # Determine number of pages needed for model prediction results
page_count = dataset_count / float(result_limit)
page_count = math.ceil(page_count) # rounds up to next full integer
self.analysis_report.set_fill_color(200)
for page in range(0, page_count): # generate each page
self.pub_model_median_stats(page, result_limit, page_count)
# NEXT PAGE(S) - ML Dataset Feature Importance Summary
# ----------------------------------------------------------------
if self.training:
logging.info("Publishing Feature Importance Summaries")
for k in range(len(self.datasets)):
self.analysis_report.add_page()
self.analysis_report.set_font('Times', 'B', 12)
self.analysis_report.cell(w=0, h=8,
txt="Feature Importance Summary: D" + str(k + 1) + ' = ' + self.datasets[k],
border=1, align="L", ln=2)
self.analysis_report.set_font(family='times', size=9)
self.analysis_report.image(
self.experiment_path + '/' + self.datasets[
k] + '/feature_selection/mutual_information/TopAverageScores.png',
5,
12, 100, 135) # Images adjusted to fit a width of 100 and length of 135
self.analysis_report.image(
self.experiment_path + '/' + self.datasets[k] + '/feature_selection/multisurf/TopAverageScores.png',
105, 12,
100,
135)
self.analysis_report.x = 0
self.analysis_report.y = 150
self.analysis_report.cell(0, 8,
"Composite Feature Importance Plot (Normalized and Performance Weighted)", 1,
align="L")
self.analysis_report.image(
self.experiment_path + '/' + self.datasets[
k] + '/model_evaluation/feature_importance/Compare_FI_Norm_Weight.png',
1, 159, 208, 125) # 130 added
self.footer()
# NEXT PAGE - Create Dataset Boxplot Comparison Page
# ---------------------------------------
if self.training:
logging.info("Publishing Dataset Comparison Boxplots")
self.analysis_report.add_page()
self.analysis_report.set_font('Times', 'B', 12)
self.analysis_report.cell(w=0, h=8, txt="Compare ML Performance Across Datasets", border=1, align="L",
ln=2)
self.analysis_report.set_font(family='times', size=9)
if len(self.datasets) > 1:
self.analysis_report.image(
self.experiment_path + '/DatasetComparisons/dataCompBoxplots/' + 'DataCompareAllModels_ROC AUC.png',
1,
12,
208, 130) # Images adjusted to fit a width of 100 and length of 135
self.analysis_report.image(
self.experiment_path + '/DatasetComparisons/dataCompBoxplots/' + 'DataCompareAllModels_PRC AUC.png',
1,
150,
208, 130) # Images adjusted to fit a width of 100 and length of 135
self.footer()
        # NEXT PAGE(S) - Create Best Kruskal-Wallis Dataset Comparison Page
# ---------------------------------------
if self.training:
logging.info("Publishing Statistical Analysis")
self.analysis_report.add_page(orientation='P')
self.analysis_report.set_margins(left=1, top=10, right=1, )
d = []
for i in range(len(self.datasets)):
d.append('Data ' + str(i + 1) + '= ' + self.datasets[i])
d.append('\n')
self.analysis_report.set_font('Times', 'B', 12)
if len(self.datasets) < 19:
self.analysis_report.cell(w=0, h=8,
                                          txt='Using Best Performing Algorithms (Kruskal-Wallis Compare Datasets)',
border=1, align="L", ln=2)
else:
self.analysis_report.cell(w=0, h=8,
                                          txt='Using Best Performing Algorithms (Kruskal-Wallis Compare Datasets): Page 1',
border=1, align="L", ln=2)
self.analysis_report.set_font(family='times', size=7)
# Dataset list Key
#list_datasets = ''
#i = 1
#for each in self.datasets:
# list_datasets = list_datasets + ('D' + str(i) + ' = ' + str(each) + '\n')
# i += 1
#self.analysis_report.x = 5
#self.analysis_report.y = 14
#self.analysis_report.multi_cell(w=0, h=4, txt='Datasets: ' + '\n' + list_datasets, border=1, align='L')
self.analysis_report.y += 2
success = False
kruskal_wallis_datasets = None
try:
# Kruskal Wallis Table
                # A table can fit at most 4 datasets comfortably with these settings
kruskal_wallis_datasets = pd.read_csv(self.experiment_path + '/DatasetComparisons/' +
'BestCompare_KruskalWallis.csv', sep=',', index_col=0)
kruskal_wallis_datasets = kruskal_wallis_datasets.round(4)
success = True
except Exception:
pass
if success:
# Process
# for i in range(len(self.datasets)):
# kruskal_wallis_datasets = kruskal_wallis_datasets.drop('Std_D'+str(i+1),1)
kruskal_wallis_datasets = kruskal_wallis_datasets.drop('Statistic', axis=1)
kruskal_wallis_datasets = kruskal_wallis_datasets.drop('Sig(*)', axis=1)
# Format
kruskal_wallis_datasets.reset_index(inplace=True)
temp_df = pd.concat([kruskal_wallis_datasets.columns.to_frame().T, kruskal_wallis_datasets])
temp_df.iloc[0, 0] = 'Metrics'
kruskal_wallis_datasets = temp_df
kruskal_wallis_datasets.columns = range(len(kruskal_wallis_datasets.columns))
# epw = 208 # Amount of Space (width) Available
th = self.analysis_report.font_size
# col_width = epw/float(10) #maximum column width
col_width_list = [23, 12, 30, 16, 30, 16, 30, 16]
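                # Assumed column layout: metric name (23 mm), p-value (12 mm), then a
                # (mean, std) width pair of (30, 16) mm for each of up to 3 datasets per table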
if len(self.datasets) <= 3: # 4
col_count = 0
kruskal_wallis_datasets = kruskal_wallis_datasets.to_numpy()
for row in kruskal_wallis_datasets:
for datum in row:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1)
col_count += 1
col_count = 0
self.analysis_report.ln(th) # critical
else:
                    # Print the next 3 datasets
col_count = 0
table1 = kruskal_wallis_datasets.iloc[:, :8] # 10
table1 = table1.to_numpy()
for row in table1:
for datum in row:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1)
col_count += 1
col_count = 0
self.analysis_report.ln(th) # critical
self.analysis_report.y += 2
col_count = 0
table1 = kruskal_wallis_datasets.iloc[:, 8:14] # 10:18
met = kruskal_wallis_datasets.iloc[:, 0]
met2 = kruskal_wallis_datasets.iloc[:, 1]
table1 = pd.concat([met, met2, table1], axis=1)
table1 = table1.to_numpy()
for row in table1:
for datum in row:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1)
col_count += 1
col_count = 0
self.analysis_report.ln(th) # critical
self.analysis_report.y += 2
if len(self.datasets) > 6: # 8
col_count = 0
table1 = kruskal_wallis_datasets.iloc[:, 14:20] # 18:26
met = kruskal_wallis_datasets.iloc[:, 0]
met2 = kruskal_wallis_datasets.iloc[:, 1]
table1 = pd.concat([met, met2, table1], axis=1)
table1 = table1.to_numpy()
for row in table1:
for datum in row:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1)
col_count += 1
col_count = 0
self.analysis_report.ln(th) # critical
self.analysis_report.y += 2
if len(self.datasets) > 9:
table1 = kruskal_wallis_datasets.iloc[:, 20:26]
met = kruskal_wallis_datasets.iloc[:, 0]
met2 = kruskal_wallis_datasets.iloc[:, 1]
table1 = pd.concat([met, met2, table1], axis=1)
table1 = table1.to_numpy()
for row in table1:
for datum in row:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1)
col_count += 1
col_count = 0
self.analysis_report.ln(th) # critical
self.analysis_report.y += 2
if len(self.datasets) > 12:
table1 = kruskal_wallis_datasets.iloc[:, 26:32]
met = kruskal_wallis_datasets.iloc[:, 0]
met2 = kruskal_wallis_datasets.iloc[:, 1]
table1 = pd.concat([met, met2, table1], axis=1)
table1 = table1.to_numpy()
for row in table1:
for datum in row:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1)
col_count += 1
col_count = 0
self.analysis_report.ln(th) # critical
self.analysis_report.y += 2
if len(self.datasets) > 15:
table1 = kruskal_wallis_datasets.iloc[:, 32:38]
met = kruskal_wallis_datasets.iloc[:, 0]
met2 = kruskal_wallis_datasets.iloc[:, 1]
table1 = pd.concat([met, met2, table1], axis=1)
table1 = table1.to_numpy()
for row in table1:
for datum in row:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1)
col_count += 1
col_count = 0
self.analysis_report.ln(th) # critical
self.analysis_report.y += 2
if len(self.datasets) > 18:
self.footer()
self.analysis_report.add_page(orientation='P')
self.analysis_report.set_margins(left=1, top=10, right=1, )
self.analysis_report.set_font('Times', 'B', 12)
self.analysis_report.cell(w=0, h=8,
                                          txt='Using Best Performing Algorithms (Kruskal-Wallis Compare Datasets): Page 2',
border=1, align="L", ln=2)
self.analysis_report.set_font(family='times', size=7)
self.analysis_report.y += 2
col_count = 0
table1 = kruskal_wallis_datasets.iloc[:, 38:44] # 18:26
met = kruskal_wallis_datasets.iloc[:, 0]
met2 = kruskal_wallis_datasets.iloc[:, 1]
table1 = pd.concat([met, met2, table1], axis=1)
table1 = table1.to_numpy()
for row in table1:
for datum in row:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1)
col_count += 1
col_count = 0
self.analysis_report.ln(th) # critical
self.analysis_report.y += 2
if len(self.datasets) > 21:
table1 = kruskal_wallis_datasets.iloc[:, 44:50]
met = kruskal_wallis_datasets.iloc[:, 0]
met2 = kruskal_wallis_datasets.iloc[:, 1]
table1 = pd.concat([met, met2, table1], axis=1)
table1 = table1.to_numpy()
for row in table1:
for datum in row:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1)
col_count += 1
col_count = 0
self.analysis_report.ln(th) # critical
self.analysis_report.y += 2
if len(self.datasets) > 24:
table1 = kruskal_wallis_datasets.iloc[:, 50:56]
met = kruskal_wallis_datasets.iloc[:, 0]
met2 = kruskal_wallis_datasets.iloc[:, 1]
table1 = pd.concat([met, met2, table1], axis=1)
table1 = table1.to_numpy()
for row in table1:
for datum in row:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1)
col_count += 1
col_count = 0
self.analysis_report.ln(th) # critical
self.analysis_report.y += 2
if len(self.datasets) > 27:
table1 = kruskal_wallis_datasets.iloc[:, 56:62]
met = kruskal_wallis_datasets.iloc[:, 0]
met2 = kruskal_wallis_datasets.iloc[:, 1]
table1 = pd.concat([met, met2, table1], axis=1)
table1 = table1.to_numpy()
for row in table1:
for datum in row:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1)
col_count += 1
col_count = 0
self.analysis_report.ln(th) # critical
self.analysis_report.y += 2
if len(self.datasets) > 30:
table1 = kruskal_wallis_datasets.iloc[:, 62:68]
met = kruskal_wallis_datasets.iloc[:, 0]
met2 = kruskal_wallis_datasets.iloc[:, 1]
table1 = pd.concat([met, met2, table1], axis=1)
table1 = table1.to_numpy()
for row in table1:
for datum in row:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1)
col_count += 1
col_count = 0
self.analysis_report.ln(th) # critical
self.analysis_report.y += 2
if len(self.datasets) > 33:
table1 = kruskal_wallis_datasets.iloc[:, 68:74]
met = kruskal_wallis_datasets.iloc[:, 0]
met2 = kruskal_wallis_datasets.iloc[:, 1]
table1 = pd.concat([met, met2, table1], axis=1)
table1 = table1.to_numpy()
for row in table1:
for datum in row:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1)
col_count += 1
col_count = 0
self.analysis_report.ln(th) # critical
self.analysis_report.y += 2
if len(self.datasets) > 36:
self.analysis_report.x = 0
self.analysis_report.y = 280
self.analysis_report.cell(0, 4, 'A maximum of 36 dataset results could be displayed', 1,
align="C")
self.footer()
#self.footer()
# LAST PAGE - Create Runtime Summary Page---------------------------------------
if self.training:
logging.info("Publishing Runtime Summary")
result_limit = 6 # Limits to this many dataset results per page
dataset_count = len(self.datasets)
            # Determine number of pages needed for runtime results
page_count = dataset_count / float(result_limit)
page_count = math.ceil(page_count) # rounds up to next full integer
for page in range(0, page_count): # generate each page
self.pub_runtime(page, result_limit, page_count)
# Output The PDF Object
try:
if self.training:
file_name = str(self.experiment_name) + '_STREAMLINE_Report.pdf'
self.analysis_report.output(self.experiment_path + '/' + file_name)
# Print phase completion
logging.info("Phase 8 complete")
try:
job_file = open(self.experiment_path + '/jobsCompleted/job_data_pdf_training.txt', 'w')
job_file.write('complete')
job_file.close()
except Exception:
pass
else:
file_name = str(self.experiment_name) + '_STREAMLINE_Replication_Report.pdf'
self.analysis_report.output(
self.experiment_path + '/' + self.train_name + '/replication/' + self.datasets[M] + '/' + file_name)
# Print phase completion
logging.info("Phase 10 complete")
try:
job_file = open(self.experiment_path + '/jobsCompleted/job_data_pdf_apply_' + str(
self.train_name) + '.txt',
'w')
job_file.write('complete')
job_file.close()
except Exception:
pass
except Exception:
            logging.info('PDF output failed')
def pub_univariate(self, page, result_limit, page_count):
""" Generates single page of univariate analysis results. Automatically moves to another page when runs out of
space. Maximum of 4 dataset results to a page."""
dataset_count = len(self.datasets)
data_start = page * result_limit
count_limit = (page * result_limit) + result_limit
self.analysis_report.add_page(orientation='P')
self.analysis_report.set_font('Times', 'B', 12)
if page_count > 1:
self.analysis_report.cell(w=180, h=8,
txt='Univariate Analysis of Each Dataset (Top 10 Features for Each): Page ' + str(
page + 1),
border=1, align='L', ln=2)
else:
self.analysis_report.cell(w=180, h=8, txt='Univariate Analysis of Each Dataset (Top 10 Features for Each)',
border=1,
align='L', ln=2)
try:
            # Try block added to handle a version-specific change to mannwhitneyu in scipy and
            # avoid a STREAMLINE crash in those circumstances.
for n in range(data_start, dataset_count):
if n >= count_limit: # Stops generating page when dataset count limit reached
break
self.analysis_report.y += 2
sig_df = pd.read_csv(
self.experiment_path + '/' + self.datasets[
n] + '/exploratory/univariate_analyses/Univariate_Significance.csv')
sig_ls = []
sig_df = sig_df.nsmallest(10, ['p-value'])
self.analysis_report.set_font('Times', 'B', 10)
self.analysis_report.multi_cell(w=160, h=6, txt='D' + str(n + 1) + ' = ' + self.datasets[n], border=0,
align='L')
# for i in range(len(sig_df)):
# sig_ls.append(sig_df.iloc[i, 0] + '\t\t\t: ')
# sig_ls.append(str(sig_df.iloc[i, 1]))
# sig_ls.append('\t\t\t' + '(' + sig_df.iloc[i, 3] + ',' + str(sig_df.iloc[i, 2]) + ')' + '\n')
# self.analysis_report.set_font('Times', 'B', 10)
# self.analysis_report.multi_cell(w=180, h=4, txt='D' + str(n + 1) + ' = ' + self.datasets[n], border=1,
# align='L')
# self.analysis_report.y += 1 # Space below section header
# self.analysis_report.set_font('Times', 'B', 8)
# self.analysis_report.multi_cell(w=180, h=4,
# txt='Feature: \t\t\t P-Value \t\t\t (Test, test statistics)', border=1,
# align='L')
# self.analysis_report.set_font('Times', '', 8)
# self.analysis_report.multi_cell(w=180, h=4, txt=' ' + list_to_string(sig_ls), border=1, align='L')
self.analysis_report.set_font('Times', '', 8)
# sig_df = sig_df.round(3)
# Format
sig_df.reset_index(inplace=True)
sig_df = pd.concat([sig_df.columns.to_frame().T, sig_df])
sig_df.columns = range(len(sig_df.columns))
th = self.analysis_report.font_size
col_width_list = [40, 40, 40, 40, 40, 20]
table1 = sig_df.iloc[:, :]
table1 = table1.to_numpy()
# Print table header first
row_count = 0
col_count = 0
for row in table1: # each row
for datum in row:
entry_list = str(datum).split(' ')
try:
if col_count == 0:
pass
else:
self.analysis_report.cell(col_width_list[col_count], th, str(datum), border=1,
align="C")
except Exception:
self.analysis_report.cell(col_width_list[col_count], th, ' ', border=1, align="C")
col_count += 1
self.analysis_report.ln(th) # critical
col_count = 0
row_count += 1
except Exception as e:
self.analysis_report.x = 5
self.analysis_report.y = 40
self.analysis_report.cell(180, 4,
# 'WARNING: Univariate analysis failed from scipy package error. To fix: pip '
# 'install --upgrade scipy',
str(e),
1, align="L")
self.footer()
def pub_model_mean_stats(self, page, result_limit, page_count):
dataset_count = len(self.datasets)
data_start = page * result_limit
count_limit = (page * result_limit) + result_limit
# Create PDF and Set Options
self.analysis_report.set_margins(left=1, top=1, right=1, )
self.analysis_report.add_page()
self.analysis_report.set_font('Times', 'B', 12)
if page_count > 1:
self.analysis_report.cell(w=0, h=8,
txt='Average Model Prediction Statistics (Rounded to 3 Decimal Points): Page '
+ str(page + 1), border=1, align='L', ln=2)
else:
self.analysis_report.cell(w=0, h=8, txt='Average Model Prediction Statistics (Rounded to 3 Decimal Points)',
border=1,
align='L', ln=2)
for n in range(data_start, dataset_count):
if n >= count_limit:
# Stops generating page when dataset count limit reached
break
self.analysis_report.y += 4
self.analysis_report.set_font('Times', 'B', 10)
self.analysis_report.multi_cell(w=0, h=4, txt='D' + str(n + 1) + ' = ' + self.datasets[n], border=1,
align='L')
self.analysis_report.y += 1 # Space below section header
self.analysis_report.set_font('Times', '', 7)
if self.training:
stats_ds = pd.read_csv(
self.experiment_path + '/' + str(
self.datasets[n]) + '/model_evaluation/Summary_performance_mean.csv',
sep=',',
index_col=0)
else:
stats_ds = pd.read_csv(self.experiment_path + '/' + self.train_name + '/replication/' + self.datasets[
n] + '/model_evaluation/Summary_performance_mean.csv', sep=',', index_col=0)
# Make list of top values for each metric
metric_name_list = ['Balanced Accuracy', 'Accuracy', 'F1 Score', 'Sensitivity (Recall)', 'Specificity',
'Precision (PPV)', 'TP', 'TN', 'FP', 'FN', 'NPV', 'LR+', 'LR-', 'ROC AUC', 'PRC AUC',
'PRC APS']
best_metric_list = []
if self.training:
ds2 = pd.read_csv(
self.experiment_path + '/' + self.datasets[n] + "/model_evaluation/Summary_performance_mean.csv")
else:
ds2 = pd.read_csv(self.experiment_path + '/' + self.train_name + '/replication/' + self.datasets[
n] + '/model_evaluation/Summary_performance_mean.csv')
self.format_fn(stats_ds, best_metric_list, metric_name_list, ds2)
self.footer()
def pub_runtime(self, page, result_limit, page_count):
"""
        Generates a single page of runtime analysis results, limited to `result_limit` dataset
        results per page; continues onto additional pages as needed.
"""
col_width_1 = 45 # maximum column width
col_width_2 = 25
dataset_count = len(self.datasets)
data_start = page * result_limit
count_limit = (page * result_limit) + result_limit
self.analysis_report.add_page(orientation='P')
self.analysis_report.set_font('Times', 'B', 12)
if page_count > 1:
self.analysis_report.cell(w=0, h=8, txt='Pipeline Runtime Summary: Page ' + str(page + 1), border=1,
align='L',
ln=2)
else:
self.analysis_report.cell(w=0, h=8, txt='Pipeline Runtime Summary', border=1, align='L', ln=2)
self.analysis_report.set_font('Times', '', 8)
th = self.analysis_report.font_size
self.analysis_report.y += 2
left = True
for n in range(data_start, dataset_count):
if n >= count_limit: # Stops generating page when dataset count limit reached
break
last_y = self.analysis_report.y
last_x = self.analysis_report.x
if left:
self.analysis_report.x = 1
time_df = pd.read_csv(self.experiment_path + '/' + self.datasets[n] + '/runtimes.csv')
time_df.iloc[:, 1] = time_df.iloc[:, 1].round(2)
time_df = pd.concat([time_df.columns.to_frame().T, time_df])
time_df = time_df.to_numpy()
self.analysis_report.set_font('Times', 'B', 10)
self.analysis_report.cell(col_width_1 + col_width_2 * 2, 4, str(self.datasets[n]), 1, align="L")
self.analysis_report.y += 5
self.analysis_report.x = last_x
self.analysis_report.set_font('Times', '', 7)
for row in time_df:
col = 0
for datum in row:
if col == 0:
self.analysis_report.cell(col_width_1, th, str(datum), border=1)
else:
self.analysis_report.cell(col_width_2, th, str(datum), border=1)
                    col += 1
self.analysis_report.ln(th) # critical
self.analysis_report.x = last_x
if left:
self.analysis_report.x = (col_width_1 + col_width_2 * 2) + 2
self.analysis_report.y = last_y
left = False
else:
self.analysis_report.x = 1
self.analysis_report.y = last_y + 70
left = True
self.footer()
def list_to_string(s):
    """Join a list of strings into a single space-separated string"""
    return " ".join(s)
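# e.g., list_to_string(['a:', '1', '\n', 'b:', '2']) returns 'a: 1 \n b: 2'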
def ngi(list1, n):
    """Find the n greatest values within a list (assumes non-negative values; mutates list1)"""
    final_list = []
    for i in range(0, n):
        max1 = 0
        for j in range(len(list1)):
            if list1[j] > max1:
                max1 = list1[j]
        list1.remove(max1)
        final_list.append(max1)
    return final_list
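# e.g., ngi([3, 1, 4, 1, 5], 2) returns [5, 4]; note that list1 is mutated in place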