Source code for libauc.utils.utils

import torch 
import numpy as np
import datetime
import os
import sys
import time
import random
import shutil
import numpy as np
from collections import Counter
from tqdm import tqdm, trange

def set_all_seeds(SEED):
    """Seed the PyTorch, NumPy and Python random generators for reproducibility.

    After seeding, cuDNN is switched to deterministic mode and its benchmark
    mode is turned off.

    Args:
        SEED: seed value handed to each of the three generators.
    """
    # Same order as before: torch first, then NumPy, then the stdlib module.
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(SEED)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
def check_tensor_shape(tensor, shape):
    """Return ``tensor``, reshaped to ``shape`` when its rank differs from the target's.

    Only the number of dimensions is compared: a tensor whose rank already
    equals ``len(shape)`` is returned unchanged, even if the individual
    dimension sizes differ.

    Args:
        tensor: a torch tensor.
        shape: target shape as a tuple, a list, or a single int.

    Returns:
        The input tensor itself, or a reshaped tensor.

    Raises:
        ValueError: if ``tensor`` is not a torch tensor, or ``shape`` is not
            a tuple, list or int.
    """
    if not torch.is_tensor(tensor):
        raise ValueError('Input is not a valid torch tensor!')
    if not isinstance(shape, (tuple, list, int)):
        raise ValueError("Shape must be a tuple, an integer or a list!")

    # A bare int describes a one-dimensional target.
    target = torch.Size([shape]) if isinstance(shape, int) else shape

    if tensor.dim() == len(target):
        return tensor
    return tensor.reshape(target)
def check_array_type(array):
    """Return ``array`` as a NumPy object.

    NumPy arrays and NumPy scalars are passed through untouched; anything
    else is converted with ``np.array``.

    Args:
        array: input to convert.

    Returns:
        The input itself when it is already an ``np.ndarray`` / ``np.generic``,
        otherwise a new ``np.ndarray`` built from it.
    """
    if isinstance(array, (np.ndarray, np.generic)):
        return array
    return np.array(array)
def check_array_shape(array, shape):
    """Convert ``array`` to NumPy and reshape it to ``shape`` where applicable.

    One-dimensional input and input whose shape already equals ``shape`` are
    returned without reshaping.

    Args:
        array: array-like input; converted with ``check_array_type``.
        shape: requested shape.

    Returns:
        The converted (and possibly reshaped) array.

    Raises:
        ValueError: if the array is empty, or it cannot be reshaped to
            ``shape``.
    """
    array = check_array_type(array)
    if array.size == 0:
        raise ValueError("Array is empty.")

    # Nothing to do for matching shapes or for 1-D input.
    if array.shape == shape or len(array.shape) == 1:
        return array

    try:
        return array.reshape(shape)
    except ValueError as e:
        raise ValueError(f"Could not reshape array of shape {array.shape} to {shape}.") from e
def check_class_labels(labels):
    """Verify that the distinct label values are exactly ``0, 1, ..., k - 1``.

    Args:
        labels: array-like of labels; converted with ``check_array_type``.

    Raises:
        ValueError: if the sorted unique labels do not equal
            ``np.arange(number_of_unique_labels)``.
    """
    distinct = np.unique(check_array_type(labels))
    expected = np.arange(len(distinct))
    if np.all(distinct == expected):
        return
    raise ValueError("Labels should be integer values starting from 0.")
def select_mean(array, threshold=0):
    """Average the elements of ``array`` that are at least ``threshold``.

    Args:
        array: array-like of values; converted with ``check_array_type``.
        threshold: inclusive lower bound an element must meet to be counted.

    Returns:
        The mean of the selected elements, or ``None`` when no element
        reaches the threshold.
    """
    values = check_array_type(array)
    kept = values[values >= threshold]
    return np.mean(kept) if len(kept) != 0 else None
def check_imbalance_ratio(labels):
    """Print the sample count and ratio of every class found in ``labels``.

    The labels are first validated with ``check_class_labels``. One line is
    printed per class, in the order classes are first encountered.

    Args:
        labels: array-like of class labels; an ``(n, 1)`` column is flattened
            before counting.
    """
    labels = check_array_type(labels)
    check_class_labels(labels)

    # Collapse an (n, 1) column into a flat vector so elements can be counted.
    if labels.ndim > 1 and labels.shape[1] == 1:
        labels = labels.flatten()

    num_samples = len(labels)
    for class_label, count in Counter(labels).items():
        class_ratio = count / num_samples
        print(f'#SAMPLES: {num_samples}, CLASS {class_label:.1f} COUNT: {count}, CLASS RATIO: {class_ratio:.4f}')
def get_time():
    """Return the current local time formatted as ``YYYY-MM-DD-HH:MM:SS``."""
    now = datetime.datetime.now()
    return now.strftime("%Y-%m-%d-%H:%M:%S")
class ImbalancedDataGenerator(object):
    """Build an imbalanced binary dataset from multi-class data and targets.

    The distinct target values are split into a negative half (values
    ``<= num_classes // 2 - 1``, relabelled 0) and a positive half
    (relabelled 1). Positive samples are then dropped so that they make up
    about ``imratio`` of the returned samples.

    Args:
        imratio: target fraction of positive samples, in ``(0, 0.5]``. May be
            left as ``None`` here and supplied to :meth:`transform` instead.
        shuffle: whether to permute the samples before and after subsampling.
        random_seed: seed passed to ``np.random.seed`` before each permutation.
        verbose: whether to print the resulting class counts and ratios.
    """

    def __init__(self, imratio=None, shuffle=True, random_seed=0, verbose=False):
        self.imratio = imratio
        self.shuffle = shuffle
        self.random_seed = random_seed
        self.verbose = verbose

    @staticmethod
    def _get_split_index(num_classes):
        """Return the largest class index that is treated as negative.

        Raises:
            NotImplementedError: if ``num_classes`` is below 2, since no
                binary split can be formed.
        """
        split_index = num_classes // 2 - 1
        if split_index < 0:
            raise NotImplementedError(
                f"Cannot form a binary split from {num_classes} class(es); "
                "at least 2 distinct target values are required."
            )
        return split_index

    @staticmethod
    def _get_class_num(targets):
        """Return the number of distinct values in ``targets``."""
        return np.unique(targets).size

    def _shuffle_together(self, data, targets):
        """Apply one seeded random permutation to both ``data`` and ``targets``."""
        # NOTE: this reseeds NumPy's global generator, exactly as before.
        np.random.seed(self.random_seed)
        order = np.random.permutation(len(targets))
        return data[order], targets[order]

    def transform(self, data, targets, imratio=None):
        """Binarize ``targets`` and subsample positives to reach ``imratio``.

        Args:
            data: array-like of samples, indexed along its first axis.
            targets: array-like of class labels aligned with ``data``;
                negative values are clipped to 0 before the classes are
                counted.
            imratio: optional override for the instance's ``imratio``; when
                given it is also stored on the instance.

        Returns:
            tuple: ``(data, targets)`` where ``targets`` is an ``(n, 1)``
            ``float32`` array of 0/1 labels.

        Raises:
            ValueError: if no ``imratio`` is available, or ``data`` and
                ``targets`` differ in length.
            AssertionError: if ``imratio`` lies outside ``(0, 0.5]``.
            NotImplementedError: if fewer than two distinct target values
                are present.
        """
        data = check_array_type(data)
        targets = check_array_type(targets)
        targets = np.maximum(targets, 0)

        if imratio is not None:
            self.imratio = imratio
        if self.imratio is None:
            raise ValueError("imratio is None.")
        # Raised explicitly (not via `assert`) so the check survives `python -O`;
        # the exception type is kept for callers that already catch it.
        if not (0 < self.imratio <= 0.5):
            raise AssertionError('imratio needs to be in (0, 0.5]!')

        # Indexing a longer `data` with a permutation of len(targets) would
        # silently discard samples, so reject mismatched inputs up front.
        if data.shape[:1] != targets.shape[:1]:
            raise ValueError(
                f"data and targets must have the same length, got "
                f"{data.shape[:1]} and {targets.shape[:1]}."
            )

        if self.shuffle:
            data, targets = self._shuffle_together(data, targets)

        num_classes = self._get_class_num(targets)
        split_index = self._get_split_index(num_classes)
        targets = np.where(targets <= split_index, 0, 1)

        if self.imratio < 0.5:
            neg_ids = np.where(targets == 0)[0]
            pos_ids = np.where(targets == 1)[0]
            # Keep p positives such that p / (p + n_neg) ~= imratio.
            num_pos = int((self.imratio / (1 - self.imratio)) * len(neg_ids))
            keep = np.concatenate([neg_ids, pos_ids[:num_pos]])
            data, targets = data[keep], targets[keep]

        # Always return labels as an (n, 1) float32 column so the output
        # format does not depend on the value of imratio.
        targets = targets.reshape(-1, 1).astype(np.float32)

        if self.shuffle:
            data, targets = self._shuffle_together(data, targets)

        if self.verbose:
            check_imbalance_ratio(targets)

        return data, targets