Source code for streamline.modeling.utils

import os
import pickle
import logging
import pandas as pd
import multiprocessing
from streamline.modeling.load_models import load_class_from_folder

# Number of worker cores: the SLURM_CPUS_PER_TASK environment variable when it is set,
# otherwise the CPU count reported by multiprocessing.
num_cores = int(os.environ.get('SLURM_CPUS_PER_TASK', multiprocessing.cpu_count()))

# Model objects returned by load_class_from_folder(); every lookup table below is
# derived from this sequence, in the same order.
SUPPORTED_MODELS_OBJ = load_class_from_folder()

# Parallel lists of per-model attributes (full name, abbreviated name, plot color).
SUPPORTED_MODELS = [model.model_name for model in SUPPORTED_MODELS_OBJ]
SUPPORTED_MODELS_SMALL = [model.small_name for model in SUPPORTED_MODELS_OBJ]
COLOR_LIST = [model.color for model in SUPPORTED_MODELS_OBJ]

# Full name OR abbreviated name -> model object.
MODEL_DICT = dict(zip(SUPPORTED_MODELS + SUPPORTED_MODELS_SMALL,
                      SUPPORTED_MODELS_OBJ * 2))

# Full name OR abbreviated name -> full name.
LABELS = dict(zip(SUPPORTED_MODELS + SUPPORTED_MODELS_SMALL,
                  SUPPORTED_MODELS * 2))

# Full name -> abbreviated name.
ABBREVIATION = {full_name: short_name
                for full_name, short_name in zip(SUPPORTED_MODELS, SUPPORTED_MODELS_SMALL)}

# Full name -> color.
COLORS = {full_name: color
          for full_name, color in zip(SUPPORTED_MODELS, COLOR_LIST)}



[docs]
def is_supported_model(string):
    """
    Look up ``string`` in ``LABELS`` and return the value stored for it.

    ``LABELS`` is keyed by both the full and the abbreviated model names and maps
    each of them to the full model name.

    Raises:
        Exception: with the message "Unknown Model" when ``string`` is not a key of ``LABELS``.
    """
    try:
        label = LABELS[string]
    except KeyError:
        raise Exception("Unknown Model")
    return label




[docs]
def model_str_to_obj(string):
    """
    Return the model object registered in ``MODEL_DICT`` under ``string``.

    ``string`` may be either a full or an abbreviated model name (both are keys of
    ``MODEL_DICT``).

    Raises:
        Exception: "Unknown Model", propagated from ``is_supported_model`` when the
            name is not registered.
    """
    # Validate with a plain call rather than `assert`: assert statements are removed
    # under `python -O`, which would silently skip the check and surface a bare
    # KeyError from the lookup below instead of the "Unknown Model" error.
    is_supported_model(string)
    return MODEL_DICT[string]




[docs]
def _load_fi_pickle(pickle_path):
    """Unpickle and return the object stored at ``pickle_path``, closing the file even on error."""
    # NOTE(review): pickle.load can execute arbitrary code embedded in the file. This
    # presumably only ever reads files written by an earlier pipeline phase -- confirm
    # these paths can never point at untrusted data.
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)


def get_fi_for_ExSTraCS(output_path, experiment_name, dataset_name, class_label, instance_label,
                        cv, filter_poor_features):
    """
    For ExSTraCS, gets the MultiSURF (or MI if MS not available) FI scores for the feature subset being analyzed
    here in modeling

    Args:
        output_path: root output directory; combined with ``experiment_name`` and
            ``dataset_name`` as ``output_path/experiment_name/dataset_name``.
        experiment_name: experiment folder name under ``output_path``.
        dataset_name: dataset folder name, also used as the CV file name prefix.
        class_label: name of the class column, removed from the CV file header.
        instance_label: name of the instance-id column removed from the header,
            or None when there is no such column.
        cv: cross-validation partition identifier used in the pickle and CSV file names.
        filter_poor_features: when truthy, scores are looked up per feature in the order
            of the CV dataset header; otherwise the first element of the pickled data
            is returned unchanged.

    Returns:
        None when neither the multisurf nor the mutual_information ``pickledForPhase4``
        folder exists. Otherwise, element 0 of the pickled data (no filtering), or a
        list of scores taken from element 1 of the pickled data, one per remaining
        header column, in header order.

    Raises:
        ValueError: if ``class_label`` (or a non-None ``instance_label``) is not a
            column of the CV dataset.
        KeyError: if a remaining header column has no entry in the pickled score mapping.
    """
    full_path = output_path + '/' + experiment_name + '/' + dataset_name

    # Prefer MultiSURF scores; fall back to mutual information; give up if neither exists.
    if os.path.exists(full_path + "/feature_selection/multisurf/pickledForPhase4/"):
        algorithm_label = 'multisurf'
    elif os.path.exists(full_path + "/feature_selection/mutual_information/pickledForPhase4/"):
        algorithm_label = 'mutual_information'
    else:
        return None

    score_info = full_path + "/feature_selection/" + algorithm_label + "/pickledForPhase4/" + str(cv) + '.pickle'

    if not filter_poor_features:
        # No feature selection was conducted: element 0 of the pickled data is returned as-is.
        return _load_fi_pickle(score_info)[0]

    # Only the column names are needed, so nrows=0 avoids parsing the data rows.
    header = pd.read_csv(
        full_path + '/CVDatasets/' + dataset_name + '_CV_' + str(cv) + '_Test.csv',
        nrows=0).columns.values.tolist()
    if instance_label is not None:
        header.remove(instance_label)
    header.remove(class_label)

    # Element 1 of the pickled data is indexed by feature name; emit the scores in the
    # same order as the columns of the working dataset.
    score_dict = _load_fi_pickle(score_info)[1]
    return [score_dict[feature] for feature in header]