Source code for streamline.utils.cleanup

import sys
import os
import shutil
import glob
import argparse
from pathlib import Path


class Cleaner:
    """
    Phase 11 of STREAMLINE (Optional) - This 'Main' script runs Phase 11, which deletes all
    temporary files in the pipeline output folder. This script is not necessary to run, but
    serves as a convenience to reduce space and clutter following a pipeline run.
    """

    def __init__(self, output_path, experiment_name, del_time=True, del_old_cv=True):
        """
        Cleaner Class

        Args:
            output_path: path to output directory
            experiment_name: name of experiment output folder (no spaces)
            del_time: delete individual run-time files (but save summary), default=True
            del_old_cv: delete any of the older versions of CV training and testing datasets
                not overwritten (preserves final training and testing datasets), default=True
        """
        self.output_path = output_path
        self.experiment_name = experiment_name
        self.experiment_path = self.output_path + '/' + self.experiment_name
        self.del_time = del_time
        self.del_old_cv = del_old_cv

        # Coerce flags to booleans (the CLI passes these as strings)
        if self.del_time == 'False' or self.del_time is False:
            self.del_time = False
        else:
            self.del_time = True
        if self.del_old_cv == 'False' or self.del_old_cv is False:
            self.del_old_cv = False
        else:
            self.del_old_cv = True

        if not os.path.exists(self.output_path):
            raise Exception("Provided output_path does not exist")
        if not os.path.exists(self.experiment_path):
            raise Exception("Provided experiment name in given output_path does not exist")
    def run(self):
        # Get dataset paths for all completed dataset analyses in the experiment folder
        datasets = os.listdir(self.experiment_path)
        remove_list = ['.DS_Store', 'metadata.pickle', 'metadata.csv', 'algInfo.pickle',
                       'DatasetComparisons', 'jobs', 'jobsCompleted', 'logs',
                       'KeyFileCopy', 'dask_logs',
                       self.experiment_name + '_STREAMLINE_Report.pdf']
        for text in remove_list:
            if text in datasets:
                datasets.remove(text)

        # Delete log folder/files
        self.rm_tree(self.experiment_path + '/' + 'logs')
        # Delete job folder/files
        self.rm_tree(self.experiment_path + '/' + 'jobs')
        # Delete jobsCompleted folder/files
        self.rm_tree(self.experiment_path + '/' + 'jobsCompleted')

        # Remake the folders (empty) in case the user wants to rerun scripts
        # (e.g., PDF report generation) from the command line
        os.mkdir(self.experiment_path + '/jobsCompleted')
        os.mkdir(self.experiment_path + '/jobs')
        os.mkdir(self.experiment_path + '/logs')

        # Delete target files within each dataset subfolder
        for dataset in datasets:
            # Delete individual runtime files (the runtime summary generated in Phase 6 is kept)
            if self.del_time:
                self.rm_tree(self.experiment_path + '/' + dataset + '/' + 'runtime')

            # Delete temporary feature importance pickle files
            # (only needed for Phase 4, then saved as summary files in Phase 6)
            self.rm_tree(self.experiment_path + '/' + dataset
                         + '/feature_selection/mutualinformation/pickledForPhase4')
            self.rm_tree(self.experiment_path + '/' + dataset
                         + '/feature_selection/multisurf/pickledForPhase4')

            # Delete older training and testing CV datasets (does not delete any
            # final versions used for training). Older CV datasets may have been
            # kept to inspect what they looked like prior to preprocessing and
            # feature selection.
            if self.del_old_cv:
                # Delete CV files generated after preprocessing but before feature selection
                files = glob.glob(self.experiment_path + '/' + dataset + '/CVDatasets/*CVOnly*')
                files = [str(Path(path)) for path in files]
                for f in files:
                    self.rm_tree(f, False)
                # Delete CV files generated after CV partitioning but before preprocessing
                files = glob.glob(self.experiment_path + '/' + dataset + '/CVDatasets/*CVPre*')
                files = [str(Path(path)) for path in files]
                for f in files:
                    self.rm_tree(f, False)
    @staticmethod
    def rm_tree(path, folder=True):
        # Remove a directory tree (or a single file when folder=False),
        # silently ignoring paths that do not exist or cannot be removed
        try:
            if folder:
                if os.path.exists(path):
                    shutil.rmtree(path)
            else:
                os.remove(path)
        except Exception:
            pass
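# Example programmatic usage (a minimal sketch; the output path and experiment
# name below are hypothetical placeholders, not values from this module):
#
#     cleaner = Cleaner(output_path='/path/to/output', experiment_name='demo_experiment',
#                       del_time=True, del_old_cv=True)
#     cleaner.run()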
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="")
    # No defaults
    parser.add_argument('--out-path', dest='output_path', type=str,
                        help='path to output directory')
    parser.add_argument('--exp-name', dest='experiment_name', type=str,
                        help='name of experiment output folder (no spaces)')
    parser.add_argument('--del-time', dest='del_time', type=str, default="True",
                        help='delete individual run-time files (but save summary)')
    parser.add_argument('--del-oldCV', dest='del_old_cv', type=str, default="True",
                        help='delete any of the older versions of CV training and testing '
                             'datasets not overwritten '
                             '(preserves final training and testing datasets)')
    options = parser.parse_args(sys.argv[1:])

    cleaner = Cleaner(options.output_path, options.experiment_name,
                      options.del_time, options.del_old_cv)
    cleaner.run()
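# Equivalent command-line usage (a sketch assuming the module is run via
# `python -m` from an installed STREAMLINE package; the paths shown are
# hypothetical):
#
#     python -m streamline.utils.cleanup --out-path /path/to/output \
#         --exp-name demo_experiment --del-time True --del-oldCV True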