Source code for streamline.modeling.utils

import os
import pickle
import logging
import pandas as pd
import multiprocessing
from streamline.modeling.load_models import load_class_from_folder

# Prefer the CPU core count allocated by SLURM when running on a cluster;
# fall back to the machine's full core count for local runs.
num_cores = int(os.environ.get('SLURM_CPUS_PER_TASK', multiprocessing.cpu_count()))

# Model wrapper classes discovered dynamically from the models folder; each
# class exposes model_name, small_name, and color attributes (used below).
SUPPORTED_MODELS_OBJ = load_class_from_folder()

SUPPORTED_MODELS = [m.model_name for m in SUPPORTED_MODELS_OBJ]

SUPPORTED_MODELS_SMALL = [m.small_name for m in SUPPORTED_MODELS_OBJ]

COLOR_LIST = [m.color for m in SUPPORTED_MODELS_OBJ]

# Look up a model class by either its full name or its abbreviation.
MODEL_DICT = dict(zip(SUPPORTED_MODELS + SUPPORTED_MODELS_SMALL,
                      SUPPORTED_MODELS_OBJ + SUPPORTED_MODELS_OBJ))

# Map either form of a model name to its canonical full name.
LABELS = dict(zip(SUPPORTED_MODELS + SUPPORTED_MODELS_SMALL,
                  SUPPORTED_MODELS + SUPPORTED_MODELS))

# Full name -> abbreviation.
ABBREVIATION = dict(zip(SUPPORTED_MODELS, SUPPORTED_MODELS_SMALL))

# Full name -> plotting color.
COLORS = dict(zip(SUPPORTED_MODELS, COLOR_LIST))
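
# Illustrative lookups (hypothetical model names; the actual keys depend on
# which model classes load_class_from_folder() discovers). Assuming a loaded
# model with model_name 'Decision Tree' and small_name 'DT':
#   MODEL_DICT['Decision Tree'] is MODEL_DICT['DT']   # -> True, same class
#   LABELS['DT']                                      # -> 'Decision Tree'
#   ABBREVIATION['Decision Tree']                     # -> 'DT'
#   COLORS['Decision Tree']                           # -> that model's plot color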


def is_supported_model(string):
    """Return the full model name for a supported model name or abbreviation."""
    try:
        return LABELS[string]
    except KeyError:
        raise Exception("Unknown Model")


def model_str_to_obj(string):
    """Return the model class for a supported model name or abbreviation."""
    assert is_supported_model(string)
    return MODEL_DICT[string]
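
# Example (hypothetical names as above): model_str_to_obj('Decision Tree') and
# model_str_to_obj('DT') return the same model class, while an unrecognized
# string raises Exception("Unknown Model").
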
def get_fi_for_ExSTraCS(output_path, experiment_name, dataset_name, class_label,
                        instance_label, cv, filter_poor_features):
    """
    For ExSTraCS, get the MultiSURF (or mutual information, if MultiSURF was
    not run) feature importance scores for the feature subset being analyzed
    in modeling.
    """
    scores = []  # to be filled in, in fitted dataset order
    full_path = output_path + '/' + experiment_name + '/' + dataset_name
    if os.path.exists(full_path + "/feature_selection/multisurf/pickledForPhase4/"):
        # MultiSURF was run previously
        algorithm_label = 'multisurf'
    elif os.path.exists(full_path + "/feature_selection/mutual_information/pickledForPhase4/"):
        # Mutual information was run previously and MultiSURF was not
        algorithm_label = 'mutual_information'
    else:
        # Neither algorithm was run, so no scores are available
        return None

    # Load the pickled feature importance scores for this CV partition
    score_info = full_path + "/feature_selection/" + algorithm_label \
        + "/pickledForPhase4/" + str(cv) + '.pickle'
    with open(score_info, 'rb') as file:
        raw_data = pickle.load(file)

    if filter_poor_features:
        # Obtain feature importance scores for the feature subset analyzed,
        # in the column order of the current (filtered) CV dataset.
        header = pd.read_csv(full_path + '/CVDatasets/' + dataset_name
                             + '_CV_' + str(cv) + '_Test.csv').columns.values.tolist()
        if instance_label is not None:
            header.remove(instance_label)
        header.remove(class_label)
        score_dict = raw_data[1]
        # Generate a filtered score list in the same order as the working datasets
        for each in header:
            scores.append(score_dict[each])
    else:
        # No feature selection was conducted; use the scores for all features
        scores = raw_data[0]
    return scores
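
# Sketch of a typical call (hypothetical argument values; the function expects
# the directory layout written by STREAMLINE's feature-selection phase):
#   scores = get_fi_for_ExSTraCS(output_path='output', experiment_name='demo',
#                                dataset_name='hcc_data', class_label='Class',
#                                instance_label=None, cv=0,
#                                filter_poor_features=True)
#   # scores: a list ordered to match the CV dataset's feature columns, or
#   # None if neither MultiSURF nor mutual information scores were found.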