From 0e82428eb96e34b286de778a9fc3c44e2f993cb9 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 30 Mar 2019 20:54:20 -0400 Subject: [PATCH] updated lib name --- research_lib/__init__.py | 0 research_lib/models/__init__.py | 0 .../models/model_examples/__init__.py | 0 research_lib/models/model_examples/bilstm.py | 167 ------- .../models/sample_model_template/__init__.py | 0 .../sample_model_template/model_template.py | 203 --------- research_lib/models/trainer.py | 407 ------------------ research_lib/root_module/__init__.py | 0 research_lib/root_module/grads.py | 40 -- research_lib/root_module/hooks.py | 20 - research_lib/root_module/memory.py | 180 -------- research_lib/root_module/model_saving.py | 168 -------- research_lib/root_module/optimization.py | 22 - research_lib/root_module/root_module.py | 167 ------- research_lib/trainer_main.py | 215 --------- research_lib/utils/__init__.py | 0 research_lib/utils/arg_parse.py | 67 --- research_lib/utils/embeddings.py | 107 ----- research_lib/utils/plotting.py | 28 -- research_lib/utils/pt_callbacks.py | 261 ----------- 20 files changed, 2052 deletions(-) delete mode 100644 research_lib/__init__.py delete mode 100644 research_lib/models/__init__.py delete mode 100644 research_lib/models/model_examples/__init__.py delete mode 100644 research_lib/models/model_examples/bilstm.py delete mode 100644 research_lib/models/sample_model_template/__init__.py delete mode 100644 research_lib/models/sample_model_template/model_template.py delete mode 100644 research_lib/models/trainer.py delete mode 100644 research_lib/root_module/__init__.py delete mode 100644 research_lib/root_module/grads.py delete mode 100644 research_lib/root_module/hooks.py delete mode 100644 research_lib/root_module/memory.py delete mode 100644 research_lib/root_module/model_saving.py delete mode 100644 research_lib/root_module/optimization.py delete mode 100644 research_lib/root_module/root_module.py delete mode 100644 research_lib/trainer_main.py delete mode 100644 research_lib/utils/__init__.py delete mode 100644 research_lib/utils/arg_parse.py delete mode 100644 research_lib/utils/embeddings.py delete mode 100644 research_lib/utils/plotting.py delete mode 100644 research_lib/utils/pt_callbacks.py diff --git a/research_lib/__init__.py b/research_lib/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/research_lib/models/__init__.py b/research_lib/models/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/research_lib/models/model_examples/__init__.py b/research_lib/models/model_examples/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/research_lib/models/model_examples/bilstm.py b/research_lib/models/model_examples/bilstm.py deleted file mode 100644 index f221a9d778..0000000000 --- a/research_lib/models/model_examples/bilstm.py +++ /dev/null @@ -1,167 +0,0 @@ -import torch.nn as nn -import numpy as np - -from test_tube import HyperOptArgumentParser -import torch -from torch.autograd import Variable -from sklearn.metrics import confusion_matrix, f1_score -from torch.nn import functional as F - - -class BiLSTMPack(nn.Module): - """ - Sample model to show how to define a template - """ - def __init__(self, hparams): - # init superclass - super(BiLSTMPack, self).__init__(hparams) - - self.hidden = None - - # trigger tag building - self.ner_tagset = {'O': 0, 'I-Bio': 1} - self.nb_tags = len(self.ner_tagset) - - # build model - print('building model...') - if hparams.model_load_weights_path is None: - self.__build_model() - print('model built') - else: - self = BiLSTMPack.load(hparams.model_load_weights_path, hparams.on_gpu, hparams) - print('model loaded from: {}'.format(hparams.model_load_weights_path)) - - def __build_model(self): - """ - Layout model - :return: - """ - # design the number of final units - self.output_dim = self.hparams.nb_lstm_units - - # when it's bidirectional our weights double - if self.hparams.bidirectional: - self.output_dim *= 2 - - # total number of words - total_words = len(self.tng_dataloader.dataset.words_token_to_idx) - - # word embeddings - self.word_embedding = nn.Embedding( - num_embeddings=total_words + 1, - embedding_dim=self.hparams.embedding_dim, - padding_idx=0 - ) - - # design the LSTM - self.lstm = nn.LSTM( - self.hparams.embedding_dim, - self.hparams.nb_lstm_units, - num_layers=self.hparams.nb_lstm_layers, - bidirectional=self.hparams.bidirectional, - dropout=self.hparams.drop_prob, - batch_first=True, - ) - - # map to tag space - self.fc_out = nn.Linear(self.output_dim, self.out_dim) - self.hidden_to_tag = nn.Linear(self.output_dim, self.nb_tags) - - - def init_hidden(self, batch_size): - - # the weights are of the form (nb_layers * 2 if bidirectional, batch_size, nb_lstm_units) - mult = 2 if self.hparams.bidirectional else 1 - hidden_a = torch.randn(self.hparams.nb_layers * mult, batch_size, self.nb_rnn_units) - hidden_b = torch.randn(self.hparams.nb_layers * mult, batch_size, self.nb_rnn_units) - - if self.hparams.on_gpu: - hidden_a = hidden_a.cuda() - hidden_b = hidden_b.cuda() - - hidden_a = Variable(hidden_a) - hidden_b = Variable(hidden_b) - - return (hidden_a, hidden_b) - - def forward(self, model_in): - # layout data (expand it, etc...) - # x = sequences - x, seq_lengths = model_in - batch_size, seq_len = x.size() - - # reset RNN hidden state - self.hidden = self.init_hidden(batch_size) - - # embed - x = self.word_embedding(x) - - # run through rnn using packed sequences - x = torch.nn.utils.rnn.pack_padded_sequence(x, seq_lengths, batch_first=True) - x, self.hidden = self.lstm(x, self.hidden) - x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True) - - # if asked for only last state, use the h_n which is the same as out(t=n) - if not self.return_sequence: - # pull out hidden states - # h_n = (nb_directions * nb_layers, batch_size, emb_size) - nb_directions = 2 if self.bidirectional else 1 - (h_n, _) = self.hidden - - # reshape to make indexing easier - # forward = 0, backward = 1 (of nb_directions) - h_n = h_n.view(self.nb_layers, nb_directions, batch_size, self.nb_rnn_units) - - # pull out last forward - forward_h_n = h_n[-1, 0, :, :] - x = forward_h_n - - # if bidirectional, also pull out the last hidden of backward network - if self.bidirectional: - backward_h_n = h_n[-1, 1, :, :] - x = torch.cat([forward_h_n, backward_h_n], dim=1) - - # project to tag space - x = x.contiguous() - x = x.view(-1, self.output_dim) - x = self.hidden_to_tag(x) - - return x - - def loss(self, model_out): - # cross entropy loss - logits, y = model_out - y, y_lens = y - - # flatten y and logits - y = y.view(-1) - logits = logits.view(-1, self.nb_tags) - - # calculate a mask to remove padding tokens - mask = (y >= 0).float() - - # count how many tokens we have - num_tokens = int(torch.sum(mask).data[0]) - - # pick the correct values and mask out - logits = logits[range(logits.shape[0]), y] * mask - - # compute the ce loss - ce_loss = -torch.sum(logits)/num_tokens - - return ce_loss - - def pull_out_last_embedding(self, x, seq_lengths, batch_size, on_gpu): - # grab only the last activations from the non-padded ouput - x_last = torch.zeros([batch_size, 1, x.size(-1)]) - for i, seq_len in enumerate(seq_lengths): - x_last[i, :, :] = x[i, seq_len-1, :] - - # put on gpu when requested - if on_gpu: - x_last = x_last.cuda() - - # turn into torch var - x_last = Variable(x_last) - - return x_last \ No newline at end of file diff --git a/research_lib/models/sample_model_template/__init__.py b/research_lib/models/sample_model_template/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/research_lib/models/sample_model_template/model_template.py b/research_lib/models/sample_model_template/model_template.py deleted file mode 100644 index 9e47794989..0000000000 --- a/research_lib/models/sample_model_template/model_template.py +++ /dev/null @@ -1,203 +0,0 @@ -import torch.nn as nn -import numpy as np -from research_lib.root_module.root_module import RootModule -from test_tube import HyperOptArgumentParser -from torchvision.datasets import MNIST -import torchvision.transforms as transforms -import torch -import torch.nn.functional as F - - -class ExampleModel1(RootModule): - """ - Sample model to show how to define a template - """ - - def __init__(self, hparams): - # init superclass - super(ExampleModel1, self).__init__(hparams) - - self.batch_size = hparams.batch_size - - # build model - self.__build_model() - - # --------------------- - # MODEL SETUP - # --------------------- - def __build_model(self): - """ - Layout model - :return: - """ - self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim) - self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim) - self.c_d1_drop = nn.Dropout(self.hparams.drop_prob) - - self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features) - - # --------------------- - # TRAINING - # --------------------- - def forward(self, x): - x = self.c_d1(x) - x = F.tanh(x) - x = self.c_d1_bn(x) - x = self.c_d1_drop(x) - - x = self.c_d2(x) - logits = F.log_softmax(x, dim=1) - - return logits - - def loss(self, labels, logits): - nll = F.nll_loss(logits, labels) - return nll - - def training_step(self, data_batch): - """ - Called inside the training loop - :param data_batch: - :return: - """ - # forward pass - x, y = data_batch - x = x.view(x.size(0), -1) - y_hat = self.forward(x) - - # calculate loss - loss_val = self.loss(y, y_hat) - - tqdm_dic = {'jefe': 1} - return loss_val, tqdm_dic - - def validation_step(self, data_batch): - """ - Called inside the validation loop - :param data_batch: - :return: - """ - x, y = data_batch - x = x.view(x.size(0), -1) - y_hat = self.forward(x) - - loss_val = self.loss(y, y_hat) - - # acc - labels_hat = torch.argmax(y_hat, dim=1) - val_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0) - - output = {'y_hat': y_hat, 'val_loss': loss_val.item(), 'val_acc': val_acc} - return output - - def validation_end(self, outputs): - """ - Called at the end of validation to aggregate outputs - :param outputs: list of individual outputs of each validation step - :return: - """ - val_loss_mean = 0 - accs = [] - for output in outputs: - val_loss_mean += output['val_loss'] - accs.append(output['val_acc']) - - val_loss_mean /= len(outputs) - tqdm_dic = {'val_loss': val_loss_mean, 'val_acc': np.mean(accs)} - return tqdm_dic - - def update_tng_log_metrics(self, logs): - return logs - - # --------------------- - # MODEL SAVING - # --------------------- - def get_save_dict(self): - checkpoint = { - 'state_dict': self.state_dict(), - } - - return checkpoint - - def load_model_specific(self, checkpoint): - self.load_state_dict(checkpoint['state_dict']) - pass - - # --------------------- - # TRAINING SETUP - # --------------------- - def configure_optimizers(self): - """ - return whatever optimizers we want here - :return: list of optimizers - """ - optimizer = self.choose_optimizer(self.hparams.optimizer_name, self.parameters(), {'lr': self.hparams.learning_rate}, 'optimizer') - self.optimizers = [optimizer] - return self.optimizers - - def __dataloader(self, train): - # init data generators - transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))]) - - dataset = MNIST(root=self.hparams.data_root, train=train, transform=transform, download=True) - - loader = torch.utils.data.DataLoader( - dataset=dataset, - batch_size=self.hparams.batch_size, - shuffle=True - ) - - return loader - - @property - def tng_dataloader(self): - if self._tng_dataloader is None: - try: - self._tng_dataloader = self.__dataloader(train=True) - except Exception as e: - print(e) - raise e - return self._tng_dataloader - - @property - def val_dataloader(self): - if self._val_dataloader is None: - try: - self._val_dataloader = self.__dataloader(train=False) - except Exception as e: - print(e) - raise e - return self._val_dataloader - - @property - def test_dataloader(self): - if self._test_dataloader is None: - try: - self._test_dataloader = self.__dataloader(train=False) - except Exception as e: - print(e) - raise e - return self._test_dataloader - - @staticmethod - def add_model_specific_args(parent_parser): - parser = HyperOptArgumentParser(strategy=parent_parser.strategy, parents=[parent_parser]) - - # param overwrites - # parser.set_defaults(gradient_clip=5.0) - - # network params - parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=False) - parser.add_argument('--in_features', default=28*28) - parser.add_argument('--hidden_dim', default=500) - parser.add_argument('--out_features', default=10) - - # data - parser.add_argument('--data_root', default='/Users/williamfalcon/Developer/personal/research_lib/research_proj/datasets/mnist', type=str) - - # training params (opt) - parser.opt_list('--learning_rate', default=0.001, type=float, options=[0.0001, 0.0005, 0.001, 0.005], - tunable=False) - parser.opt_list('--batch_size', default=256, type=int, options=[32, 64, 128, 256], tunable=False) - parser.opt_list('--optimizer_name', default='adam', type=str, options=['adam'], tunable=False) - return parser diff --git a/research_lib/models/trainer.py b/research_lib/models/trainer.py deleted file mode 100644 index c344382ae9..0000000000 --- a/research_lib/models/trainer.py +++ /dev/null @@ -1,407 +0,0 @@ -import torch -import tqdm -import numpy as np -from research_lib.root_module.memory import get_gpu_memory_map -import traceback -from research_lib.root_module.model_saving import TrainerIO -from torch.optim.lr_scheduler import MultiStepLR - - -class Trainer(TrainerIO): - - def __init__(self, - experiment, - on_gpu, - cluster, enable_tqdm, - overfit_pct, - track_grad_norm, - fast_dev_run, - check_val_every_n_epoch, - accumulate_grad_batches, - process_position, current_gpu_name, - checkpoint_callback, early_stop_callback, - enable_early_stop, max_nb_epochs, min_nb_epochs, - train_percent_check, val_percent_check, test_percent_check, val_check_interval, - log_save_interval, add_log_row_interval, - lr_scheduler_milestones, - nb_sanity_val_steps=5): - - # Transfer params - self.check_val_every_n_epoch = check_val_every_n_epoch - self.enable_early_stop = enable_early_stop - self.track_grad_norm = track_grad_norm - self.fast_dev_run = fast_dev_run - self.on_gpu = on_gpu - self.enable_tqdm = enable_tqdm - self.experiment = experiment - self.exp_save_path = experiment.get_data_path(experiment.name, experiment.version) - self.cluster = cluster - self.process_position = process_position - self.current_gpu_name = current_gpu_name - self.checkpoint_callback = checkpoint_callback - self.checkpoint_callback.save_function = self.save_checkpoint - self.early_stop = early_stop_callback - self.model = None - self.max_nb_epochs = max_nb_epochs - self.accumulate_grad_batches = accumulate_grad_batches - self.early_stop_callback = early_stop_callback - self.min_nb_epochs = min_nb_epochs - self.nb_sanity_val_steps = nb_sanity_val_steps - self.lr_scheduler_milestones = [] if lr_scheduler_milestones is None else [int(x.strip()) for x in lr_scheduler_milestones.split(',')] - self.lr_schedulers = [] - - # training state - self.optimizers = None - self.prog_bar = None - self.global_step = 0 - self.current_epoch = 0 - self.total_batches = 0 - - # logging - self.log_save_interval = log_save_interval - self.val_check_interval = val_check_interval - self.add_log_row_interval = add_log_row_interval - - # dataloaders - self.tng_dataloader = None - self.test_dataloader = None - self.val_dataloader = None - - # how much of the data to use - self.__determine_data_use_amount(train_percent_check, val_percent_check, test_percent_check, overfit_pct) - print('gpu available: {}, used: {}'.format(torch.cuda.is_available(), self.on_gpu)) - - def __determine_data_use_amount(self, train_percent_check, val_percent_check, test_percent_check, overfit_pct): - """ - Use less data for debugging purposes - """ - self.train_percent_check = train_percent_check - self.val_percent_check = val_percent_check - self.test_percent_check = test_percent_check - if overfit_pct > 0: - self.train_percent_check = overfit_pct - self.val_percent_check = overfit_pct - self.test_percent_check = overfit_pct - - def __is_function_implemented(self, f_name): - f_op = getattr(self, f_name, None) - return callable(f_op) - - @property - def __tng_tqdm_dic(self): - tqdm_dic = { - 'tng_loss': '{0:.3f}'.format(self.avg_loss), - 'gpu': '{}'.format(self.current_gpu_name), - 'v_nb': '{}'.format(self.experiment.version), - 'epoch': '{}'.format(self.current_epoch), - 'batch_nb':'{}'.format(self.batch_nb), - } - tqdm_dic.update(self.tqdm_metrics) - return tqdm_dic - - def __layout_bookeeping(self): - # training bookeeping - self.total_batch_nb = 0 - self.running_loss = [] - self.avg_loss = 0 - self.batch_nb = 0 - self.tqdm_metrics = {} - - # determine number of training batches - nb_tng_batches = self.model.nb_batches(self.tng_dataloader) - self.nb_tng_batches = int(nb_tng_batches * self.train_percent_check) - - # determine number of validation batches - nb_val_batches = self.model.nb_batches(self.val_dataloader) - nb_val_batches = int(nb_val_batches * self.val_percent_check) - nb_val_batches = max(1, nb_val_batches) - self.nb_val_batches = nb_val_batches - - # determine number of test batches - nb_test_batches = self.model.nb_batches(self.test_dataloader) - self.nb_test_batches = int(nb_test_batches * self.test_percent_check) - - # determine when to check validation - self.val_check_batch = int(nb_tng_batches * self.val_check_interval) - - def __add_tqdm_metrics(self, metrics): - for k, v in metrics.items(): - self.tqdm_metrics[k] = v - - def validate(self, model, dataloader, max_batches): - """ - Run validation code - :param model: PT model - :param dataloader: PT dataloader - :param max_batches: Scalar - :return: - """ - print('validating...') - - # enable eval mode - model.zero_grad() - model.eval() - - # disable gradients to save memory - torch.set_grad_enabled(False) - - # bookkeeping - outputs = [] - - # run training - for i, data_batch in enumerate(dataloader): - - if data_batch is None: - continue - - # stop short when on fast dev run - if max_batches is not None and i >= max_batches: - break - - # ----------------- - # RUN VALIDATION STEP - # ----------------- - output = model.validation_step(data_batch) - outputs.append(output) - - # batch done - if self.enable_tqdm and self.prog_bar is not None: - self.prog_bar.update(1) - - # give model a chance to do something with the outputs - val_results = model.validation_end(outputs) - - # enable train mode again - model.train() - - # enable gradients to save memory - torch.set_grad_enabled(True) - return val_results - - def __get_dataloaders(self, model): - """ - Dataloaders are provided by the model - :param model: - :return: - """ - self.tng_dataloader = model.tng_dataloader - self.test_dataloader = model.test_dataloader - self.val_dataloader = model.val_dataloader - - # ----------------------------- - # MODEL TRAINING - # ----------------------------- - def fit(self, model): - self.model = model - - # transfer data loaders from model - self.__get_dataloaders(model) - - # init training constants - self.__layout_bookeeping() - - # CHOOSE OPTIMIZER - # filter out the weights that were done on gpu so we can load on good old cpus - self.optimizers = model.configure_optimizers() - - # add lr schedulers - if self.lr_scheduler_milestones is not None: - for optimizer in self.optimizers: - scheduler = MultiStepLR(optimizer, self.lr_scheduler_milestones) - self.lr_schedulers.append(scheduler) - - # print model summary - model.summarize() - - # put on gpu if needed - if self.on_gpu: - model = model.cuda() - - # run tiny validation to make sure program won't crash during val - _ = self.validate(model, self.val_dataloader, max_batches=self.nb_sanity_val_steps) - - # save exp to get started - self.experiment.save() - - # enable cluster checkpointing - self.enable_auto_hpc_walltime_manager() - - # --------------------------- - # CORE TRAINING LOOP - # --------------------------- - self.__train() - - def __train(self): - # run all epochs - for epoch_nb in range(self.current_epoch, self.max_nb_epochs): - # update the lr scheduler - for lr_scheduler in self.lr_schedulers: - lr_scheduler.step() - - self.model.current_epoch = epoch_nb - - # hook - if self.__is_function_implemented('on_epoch_start'): - self.model.on_epoch_start() - - self.current_epoch = epoch_nb - self.total_batches = self.nb_tng_batches + self.nb_val_batches - self.batch_loss_value = 0 # accumulated grads - - # init progbar when requested - if self.enable_tqdm: - self.prog_bar = tqdm.tqdm(range(self.total_batches), position=self.process_position) - - for batch_nb, data_batch in enumerate(self.tng_dataloader): - self.batch_nb = batch_nb - self.global_step += 1 - self.model.global_step = self.global_step - - # stop when the flag is changed or we've gone past the amount requested in the batches - self.total_batch_nb += 1 - met_batch_limit = batch_nb > self.nb_tng_batches - if met_batch_limit: - break - - # --------------- - # RUN TRAIN STEP - # --------------- - self.__run_tng_batch(data_batch) - - # --------------- - # RUN VAL STEP - # --------------- - is_val_check_batch = (batch_nb + 1) % self.val_check_batch == 0 - if self.fast_dev_run or is_val_check_batch: - self.__run_validation() - - # when batch should be saved - if (batch_nb + 1) % self.log_save_interval == 0: - self.experiment.save() - - # when metrics should be logged - if batch_nb % self.add_log_row_interval == 0: - # count items in memory - # nb_params, nb_tensors = count_mem_items() - - metrics = self.model.update_tng_log_metrics(self.__tng_tqdm_dic) - - # add gpu memory - if self.on_gpu: - mem_map = get_gpu_memory_map() - metrics.update(mem_map) - - # add norms - if self.track_grad_norm > 0: - grad_norm_dic = self.model.grad_norm(self.track_grad_norm) - metrics.update(grad_norm_dic) - - # log metrics - self.experiment.log(metrics) - self.experiment.save() - - # hook - if self.__is_function_implemented('on_batch_end'): - self.model.on_batch_end() - - # hook - if self.__is_function_implemented('on_epoch_end'): - self.model.on_epoch_end() - - # early stopping - if self.enable_early_stop: - should_stop = self.early_stop_callback.on_epoch_end(epoch=epoch_nb, logs=self.__tng_tqdm_dic) - met_min_epochs = epoch_nb > self.min_nb_epochs - - # stop training - stop = should_stop and met_min_epochs - if stop: - return - - def __run_tng_batch(self, data_batch): - if data_batch is None: - return - - # hook - if self.__is_function_implemented('on_batch_start'): - self.model.on_batch_start() - - if self.enable_tqdm: - self.prog_bar.update(1) - - # forward pass - # return a scalar value and a dic with tqdm metrics - loss, model_specific_tqdm_metrics_dic = self.model.training_step(data_batch) - self.__add_tqdm_metrics(model_specific_tqdm_metrics_dic) - - # backward pass - loss.backward() - self.batch_loss_value += loss.item() - - # gradient update with accumulated gradients - if (self.batch_nb + 1) % self.accumulate_grad_batches == 0: - - # update gradients across all optimizers - for optimizer in self.optimizers: - optimizer.step() - - # clear gradients - optimizer.zero_grad() - - # queuing loss across batches blows it up proportionally... divide out the number accumulated - self.batch_loss_value = self.batch_loss_value / self.accumulate_grad_batches - - # track loss - self.running_loss.append(self.batch_loss_value) - self.batch_loss_value = 0 - self.avg_loss = np.mean(self.running_loss[-100:]) - - # update progbar - if self.enable_tqdm: - # add model specific metrics - tqdm_metrics = self.__tng_tqdm_dic - self.prog_bar.set_postfix(**tqdm_metrics) - - # activate batch end hook - if self.__is_function_implemented('on_batch_end'): - self.model.on_batch_end() - - def __run_validation(self): - # decide if can check epochs - can_check_epoch = (self.current_epoch + 1) % self.check_val_every_n_epoch == 0 - if self.fast_dev_run: - print('skipping to check performance bc of --fast_dev_run') - elif not can_check_epoch: - return - - try: - # hook - if self.__is_function_implemented('on_pre_performance_check'): - self.model.on_pre_performance_check() - - # use full val set on end of epoch - # use a small portion otherwise - max_batches = None if not self.fast_dev_run else 1 - model_specific_tqdm_metrics_dic = self.validate( - self.model, - self.val_dataloader, - max_batches - ) - self.__add_tqdm_metrics(model_specific_tqdm_metrics_dic) - - # hook - if self.__is_function_implemented('on_post_performance_check'): - self.model.on_post_performance_check() - - except Exception as e: - print(e) - print(traceback.print_exc()) - - if self.enable_tqdm: - # add model specific metrics - tqdm_metrics = self.__tng_tqdm_dic - self.prog_bar.set_postfix(**tqdm_metrics) - - # model checkpointing - print('save callback...') - self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch, logs=self.__tng_tqdm_dic) diff --git a/research_lib/root_module/__init__.py b/research_lib/root_module/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/research_lib/root_module/grads.py b/research_lib/root_module/grads.py deleted file mode 100644 index e4d1701b4c..0000000000 --- a/research_lib/root_module/grads.py +++ /dev/null @@ -1,40 +0,0 @@ -import numpy as np -from torch import nn - -""" -Module to describe gradients -""" - - -class GradInformation(nn.Module): - - def grad_norm(self, norm_type): - results = {} - total_norm = 0 - for i, p in enumerate(self.parameters()): - if p.requires_grad: - try: - param_norm = p.grad.data.norm(norm_type) - total_norm += param_norm ** norm_type - norm = param_norm ** (1 / norm_type) - - results['grad_{}_norm_{}'.format(norm_type, i)] = round(norm.data.cpu().numpy().flatten()[0], 3) - except Exception as e: - # this param had no grad - pass - - total_norm = total_norm ** (1. / norm_type) - results['grad_{}_norm_total'.format(norm_type)] = round(total_norm.data.cpu().numpy().flatten()[0], 3) - return results - - - def describe_grads(self): - for p in self.parameters(): - g = p.grad.data.numpy().flatten() - print(np.max(g), np.min(g), np.mean(g)) - - - def describe_params(self): - for p in self.parameters(): - g = p.data.numpy().flatten() - print(np.max(g), np.min(g), np.mean(g)) \ No newline at end of file diff --git a/research_lib/root_module/hooks.py b/research_lib/root_module/hooks.py deleted file mode 100644 index 3517f571db..0000000000 --- a/research_lib/root_module/hooks.py +++ /dev/null @@ -1,20 +0,0 @@ -import torch - -class ModelHooks(torch.nn.Module): - def on_batch_start(self): - pass - - def on_batch_end(self): - pass - - def on_epoch_start(self): - pass - - def on_epoch_end(self): - pass - - def on_pre_performance_check(self): - pass - - def on_post_performance_check(self): - pass diff --git a/research_lib/root_module/memory.py b/research_lib/root_module/memory.py deleted file mode 100644 index 17f20efe64..0000000000 --- a/research_lib/root_module/memory.py +++ /dev/null @@ -1,180 +0,0 @@ -import torch -import gc -import subprocess -import numpy as np -import pandas as pd - - -''' -Generates a summary of a model's layers and dimensionality -''' - - -class ModelSummary(object): - - def __init__(self, model): - ''' - Generates summaries of model layers and dimensions. - ''' - self.model = model - self.in_sizes = [] - self.out_sizes = [] - - self.summarize() - - def __str__(self): - return self.summary.__str__() - - def __repr__(self): - return self.summary.__str__() - - def get_variable_sizes(self): - '''Run sample input through each layer to get output sizes''' - mods = list(self.model.modules()) - in_sizes = [] - out_sizes = [] - input_ = self.example_input_array - for i in range(1, len(mods)): - m = mods[i] - if type(input_) is list or type(input_) is tuple: - out = m(*input_) - else: - out = m(input_) - - if type(input_) is tuple or type(input_) is list: - in_size = [] - for x in input_: - if type(x) is list: - in_size.append(len(x)) - else: - in_size.append(x.size()) - else: - in_size = np.array(input_.size()) - - in_sizes.append(in_size) - - if type(out) is tuple or type(out) is list: - out_size = np.asarray([x.size() for x in out]) - else: - out_size = np.array(out.size()) - - out_sizes.append(out_size) - input_ = out - - self.in_sizes = in_sizes - self.out_sizes = out_sizes - return - - def get_layer_names(self): - '''Collect Layer Names''' - mods = list(self.model.named_modules()) - names = [] - layers = [] - for m in mods[1:]: - names += [m[0]] - layers += [str(m[1].__class__)] - - layer_types = [x.split('.')[-1][:-2] for x in layers] - - self.layer_names = names - self.layer_types = layer_types - return - - def get_parameter_sizes(self): - '''Get sizes of all parameters in `model`''' - mods = list(self.model.modules()) - sizes = [] - - for i in range(1,len(mods)): - m = mods[i] - p = list(m.parameters()) - modsz = [] - for j in range(len(p)): - modsz.append(np.array(p[j].size())) - sizes.append(modsz) - - self.param_sizes = sizes - return - - def get_parameter_nums(self): - '''Get number of parameters in each layer''' - param_nums = [] - for mod in self.param_sizes: - all_params = 0 - for p in mod: - all_params += np.prod(p) - param_nums.append(all_params) - self.param_nums = param_nums - return - - def make_summary(self): - ''' - Makes a summary listing with: - - Layer Name, Layer Type, Input Size, Output Size, Number of Parameters - ''' - - df = pd.DataFrame( np.zeros( (len(self.layer_names), 3) ) ) - df.columns = ['Name', 'Type', 'Params'] - - df['Name'] = self.layer_names - df['Type'] = self.layer_types - df['Params'] = self.param_nums - - self.summary = df - return - - def summarize(self): - self.get_layer_names() - self.get_parameter_sizes() - self.get_parameter_nums() - self.make_summary() - - -def print_mem_stack(): - for obj in gc.get_objects(): - try: - if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)): - print(type(obj), obj.size()) - except Exception as e: - pass - - -def count_mem_items(): - nb_params = 0 - nb_tensors = 0 - for obj in gc.get_objects(): - try: - if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)): - obj_type = str(type(obj)) - if 'parameter' in obj_type: - nb_params += 1 - else: - nb_tensors += 1 - except Exception as e: - pass - - return nb_params, nb_tensors - - -def get_gpu_memory_map(): - """Get the current gpu usage. - - Returns - ------- - usage: dict - Keys are device ids as integers. - Values are memory usage as integers in MB. - """ - result = subprocess.check_output( - [ - 'nvidia-smi', '--query-gpu=memory.used', - '--format=csv,nounits,noheader' - ], encoding='utf-8') - # Convert lines into a dictionary - gpu_memory = [int(x) for x in result.strip().split('\n')] - gpu_memory_map = {} - for k, v in zip(range(len(gpu_memory)), gpu_memory): - k = f'gpu_{k}' - gpu_memory_map[k] = v - return gpu_memory_map diff --git a/research_lib/root_module/model_saving.py b/research_lib/root_module/model_saving.py deleted file mode 100644 index 0e588eddb6..0000000000 --- a/research_lib/root_module/model_saving.py +++ /dev/null @@ -1,168 +0,0 @@ -import torch -import os -import re - - -class ModelIO(object): - - def load_model_specific(self, checkpoint): - """ - Do something with the checkpoint - :param checkpoint: - :return: - """ - raise NotImplementedError - - def get_save_dict(self): - """ - Return specific things for the model - :return: - """ - raise NotImplementedError - - -class TrainerIO(object): - - # -------------------- - # MODEL SAVE CHECKPOINT - # -------------------- - def save_checkpoint(self, filepath): - checkpoint = self.dump_checkpoint() - - # do the actual save - torch.save(checkpoint, filepath) - - def dump_checkpoint(self): - checkpoint = { - 'epoch': self.current_epoch, - 'checkpoint_callback_best': self.checkpoint_callback.best, - 'early_stop_callback_wait': self.early_stop_callback.wait, - 'early_stop_callback_patience': self.early_stop_callback.patience, - 'global_step': self.global_step - } - - optimizer_states = [] - for i, optimizer in enumerate(self.optimizers): - optimizer_states.append(optimizer.state_dict()) - - checkpoint['optimizer_states'] = optimizer_states - - # request what to save from the model - checkpoint_dict = self.model.get_save_dict() - - # merge trainer and model saving items - checkpoint.update(checkpoint_dict) - return checkpoint - - # -------------------- - # HPC IO - # -------------------- - def enable_auto_hpc_walltime_manager(self): - if self.cluster is None: - return - - # allow test tube to handle model check pointing automatically - self.cluster.set_checkpoint_save_function( - self.hpc_save, - kwargs={ - 'folderpath': self.checkpoint_callback.filepath, - 'experiment': self.experiment - } - ) - self.cluster.set_checkpoint_load_function( - self.hpc_load, - kwargs={ - 'folderpath': self.checkpoint_callback.filepath, - 'on_gpu': self.on_gpu - } - ) - - def restore_training_state(self, checkpoint): - """ - Restore trainer state. - Model will get its change to update - :param checkpoint: - :return: - """ - self.checkpoint_callback.best = checkpoint['checkpoint_callback_best'] - self.early_stop_callback.wait = checkpoint['early_stop_callback_wait'] - self.early_stop_callback.patience = checkpoint['early_stop_callback_patience'] - self.global_step = checkpoint['global_step'] - - # restore the optimizers - optimizer_states = checkpoint['optimizer_states'] - for optimizer, opt_state in zip(self.optimizers, optimizer_states): - optimizer.load_state_dict(opt_state) - - # ---------------------------------- - # PRIVATE OPS - # ---------------------------------- - def hpc_save(self, folderpath, experiment): - # save exp to make sure we get all the metrics - experiment.save() - - ckpt_number = self.max_ckpt_in_folder(folderpath) + 1 - - if not os.path.exists(folderpath): - os.makedirs(folderpath, exist_ok=True) - filepath = '{}/hpc_ckpt_{}.ckpt'.format(folderpath, ckpt_number) - - # request what to save from the model - checkpoint_dict = self.dump_checkpoint() - - # do the actual save - torch.save(checkpoint_dict, filepath) - - def hpc_load(self, folderpath, on_gpu): - filepath = '{}/hpc_ckpt_{}.ckpt'.format(folderpath, self.max_ckpt_in_folder(folderpath)) - - if on_gpu: - checkpoint = torch.load(filepath) - else: - checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage) - - # load training state - self.restore_training_state(checkpoint) - - # load model state - self.model.load_model_specific(checkpoint) - - def max_ckpt_in_folder(self, path): - files = os.listdir(path) - ckpt_vs = [] - for name in files: - name = name.split('ckpt_')[-1] - name = re.sub('[^0-9]', '', name) - ckpt_vs.append(int(name)) - - return max(ckpt_vs) - - -def load_hparams_from_tags_csv(tags_csv): - from argparse import Namespace - import pandas as pd - - tags_df = pd.read_csv(tags_csv) - dic = tags_df.to_dict(orient='records') - - ns_dict = {row['key']: convert(row['value']) for row in dic} - - ns = Namespace(**ns_dict) - return ns - - -def convert(val): - constructors = [int, float, str] - - if type(val) is str: - if val.lower() == 'true': - return True - if val.lower() == 'false': - return False - - for c in constructors: - try: - return c(val) - except ValueError: - pass - return val diff --git a/research_lib/root_module/optimization.py b/research_lib/root_module/optimization.py deleted file mode 100644 index 3172e1a1e6..0000000000 --- a/research_lib/root_module/optimization.py +++ /dev/null @@ -1,22 +0,0 @@ -from torch import nn -from torch import optim - - -class OptimizerConfig(nn.Module): - - def choose_optimizer(self, optimizer, params, optimizer_params, opt_name_key): - if optimizer == 'adam': - optimizer = optim.Adam(params, **optimizer_params) - if optimizer == 'sparse_adam': - optimizer = optim.SparseAdam(params, **optimizer_params) - if optimizer == 'sgd': - optimizer = optim.SGD(params, **optimizer_params) - if optimizer == 'adadelta': - optimizer = optim.Adadelta(params, **optimizer_params) - - # transfer opt state if loaded - if opt_name_key in self.loaded_optimizer_states_dict: - state = self.loaded_optimizer_states_dict[opt_name_key] - optimizer.load_state_dict(state) - - return optimizer diff --git a/research_lib/root_module/root_module.py b/research_lib/root_module/root_module.py deleted file mode 100644 index c8e3090ab6..0000000000 --- a/research_lib/root_module/root_module.py +++ /dev/null @@ -1,167 +0,0 @@ -import os -import torch -import math - -from research_lib.root_module.memory import ModelSummary -from research_lib.root_module.grads import GradInformation -from research_lib.root_module.model_saving import ModelIO, load_hparams_from_tags_csv -from research_lib.root_module.optimization import OptimizerConfig -from research_lib.root_module.hooks import ModelHooks - - -class RootModule(GradInformation, ModelIO, OptimizerConfig, ModelHooks): - - def __init__(self, hparams): - super(RootModule, self).__init__() - self.hparams = hparams - self.on_gpu = hparams.on_gpu - self.dtype = torch.FloatTensor - self.exp_save_path = None - self.current_epoch = 0 - self.global_step = 0 - self.loaded_optimizer_states_dict = {} - self.fast_dev_run = hparams.fast_dev_run - self.overfit = hparams.overfit - self.gradient_clip = hparams.gradient_clip - self.num = 2 - - # computed vars for the dataloaders - self._tng_dataloader = None - self._val_dataloader = None - self._test_dataloader = None - - if self.on_gpu: - print('running on gpu...') - self.dtype = torch.cuda.FloatTensor - torch.set_default_tensor_type('torch.cuda.FloatTensor') - - def forward(self, *args, **kwargs): - """ - Expand model in into whatever you need. - Also need to return the target - :param x: - :return: - """ - raise NotImplementedError - - def validation_step(self, data_batch): - """ - return whatever outputs will need to be aggregated in validation_end - :param data_batch: - :return: - """ - raise NotImplementedError - - def validation_end(self, outputs): - """ - Outputs has the appended output after each validation step - :param outputs: - :return: dic_with_metrics for tqdm - """ - raise NotImplementedError - - def training_step(self, data_batch): - """ - return loss, dict with metrics for tqdm - :param data_batch: - :return: - """ - raise NotImplementedError - - def configure_optimizers(self): - """ - Return array of optimizers - :return: - """ - raise NotImplementedError - - def update_tng_log_metrics(self, logs): - """ - Chance to update metrics to be logged for training step. - For example, add music, images, etc... to log - :param logs: - :return: - """ - raise NotImplementedError - - def loss(self, *args, **kwargs): - """ - Expand model_out into your components - :param model_out: - :return: - """ - raise NotImplementedError - - def summarize(self): - model_summary = ModelSummary(self) - print(model_summary) - - def nb_batches(self, dataloader): - a = math.ceil(float(len(dataloader.dataset) / self.batch_size)) - return int(a) - - def freeze(self): - for param in self.parameters(): - param.requires_grad = False - - def unfreeze(self): - for param in self.parameters(): - param.requires_grad = True - - @property - def tng_dataloader(self): - """ - Implement a function to load an h5py of this data - :return: - """ - raise NotImplementedError - - @property - def test_dataloader(self): - """ - Implement a function to load an h5py of this data - :return: - """ - raise NotImplementedError - - @property - def val_dataloader(self): - """ - Implement a function to load an h5py of this data - :return: - """ - raise NotImplementedError - - @staticmethod - def get_process_position(gpus): - try: - current_gpu = os.environ["CUDA_VISIBLE_DEVICES"] - gpu_ids = gpus.split(';') - process_position = gpu_ids.index(current_gpu) - return process_position, current_gpu - except Exception as e: - return 0, 0 - - @classmethod - def load_from_metrics(cls, weights_path, tags_csv, on_gpu): - """ - Primary way of loading model from csv weights path - :param weights_path: - :param tags_csv: - :param on_gpu: - :return: - """ - hparams = load_hparams_from_tags_csv(tags_csv) - hparams.__setattr__('on_gpu', on_gpu) - - if on_gpu: - checkpoint = torch.load(weights_path) - else: - checkpoint = torch.load(weights_path, map_location=lambda storage, loc: storage) - - model = cls(hparams) - - # allow model to load - model.load_model_specific(checkpoint) - model.load_state_dict(checkpoint['state_dict'], strict=False) - return model diff --git a/research_lib/trainer_main.py b/research_lib/trainer_main.py deleted file mode 100644 index f630fcefb8..0000000000 --- a/research_lib/trainer_main.py +++ /dev/null @@ -1,215 +0,0 @@ -import os -import sys - -import torch -import numpy as np -from test_tube import HyperOptArgumentParser, Experiment, SlurmCluster -from research_lib.models.trainer import Trainer -from research_lib.utils.arg_parse import add_default_args -from time import sleep - -from research_lib.utils.pt_callbacks import EarlyStopping, ModelCheckpoint - -SEED = 2334 -torch.manual_seed(SEED) -np.random.seed(SEED) - -# --------------------- -# DEFINE MODEL HERE -# --------------------- -from research_lib.models.sample_model_template.model_template import ExampleModel1 -# --------------------- - -AVAILABLE_MODELS = { - 'model_1': ExampleModel1 -} - - -""" -Allows training by using command line arguments - -Run by: -# TYPE YOUR RUN COMMAND HERE -""" - - -def main_local(hparams): - main(hparams, None, None) - - -def main(hparams, cluster, results_dict): - """ - Main training routine specific for this project - :param hparams: - :return: - """ - on_gpu = torch.cuda.is_available() - if hparams.disable_cuda: - on_gpu = False - - device = 'cuda' if on_gpu else 'cpu' - hparams.__setattr__('device', device) - hparams.__setattr__('on_gpu', on_gpu) - hparams.__setattr__('nb_gpus', torch.cuda.device_count()) - hparams.__setattr__('inference_mode', hparams.model_load_weights_path is not None) - - # delay each training start to not overwrite logs - process_position, current_gpu = TRAINING_MODEL.get_process_position(hparams.gpus) - sleep(process_position + 1) - - # init experiment - exp = Experiment( - name=hparams.tt_name, - debug=hparams.debug, - save_dir=hparams.tt_save_path, - version=hparams.hpc_exp_number, - autosave=False, - description=hparams.tt_description - ) - - exp.argparse(hparams) - exp.save() - - # build model - print('loading model...') - model = TRAINING_MODEL(hparams) - print('model built') - - # callbacks - early_stop = EarlyStopping( - monitor=hparams.early_stop_metric, - patience=hparams.early_stop_patience, - verbose=True, - mode=hparams.early_stop_mode - ) - - model_save_path = '{}/{}/{}'.format(hparams.model_save_path, exp.name, exp.version) - checkpoint = ModelCheckpoint( - filepath=model_save_path, - save_function=None, - save_best_only=True, - verbose=True, - monitor=hparams.model_save_monitor_value, - mode=hparams.model_save_monitor_mode - ) - - # configure trainer - trainer = Trainer( - experiment=exp, - on_gpu=on_gpu, - cluster=cluster, - enable_tqdm=hparams.enable_tqdm, - overfit_pct=hparams.overfit, - track_grad_norm=hparams.track_grad_norm, - fast_dev_run=hparams.fast_dev_run, - check_val_every_n_epoch=hparams.check_val_every_n_epoch, - accumulate_grad_batches=hparams.accumulate_grad_batches, - process_position=process_position, - current_gpu_name=current_gpu, - checkpoint_callback=checkpoint, - early_stop_callback=early_stop, - enable_early_stop=hparams.enable_early_stop, - max_nb_epochs=hparams.max_nb_epochs, - min_nb_epochs=hparams.min_nb_epochs, - train_percent_check=hparams.train_percent_check, - val_percent_check=hparams.val_percent_check, - test_percent_check=hparams.test_percent_check, - val_check_interval=hparams.val_check_interval, - log_save_interval=hparams.log_save_interval, - add_log_row_interval=hparams.add_log_row_interval, - lr_scheduler_milestones=hparams.lr_scheduler_milestones - ) - - # train model - trainer.fit(model) - - -def get_default_parser(strategy, root_dir): - - possible_model_names = list(AVAILABLE_MODELS.keys()) - parser = HyperOptArgumentParser(strategy=strategy, add_help=False) - add_default_args(parser, root_dir, possible_model_names, SEED) - return parser - - -def get_model_name(args): - for i, arg in enumerate(args): - if 'model_name' in arg: - return args[i+1] - - -def optimize_on_cluster(hyperparams): - # enable cluster training - cluster = SlurmCluster( - hyperparam_optimizer=hyperparams, - log_path=hyperparams.tt_save_path, - test_tube_exp_name=hyperparams.tt_name - ) - - # email for cluster coms - cluster.notify_job_status(email='add_email_here', on_done=True, on_fail=True) - - # configure cluster - cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus - cluster.job_time = '48:00:00' - cluster.gpu_type = '1080ti' - cluster.memory_mb_per_node = 48000 - - # any modules for code to run in env - cluster.add_command('source activate research_lib') - - # name of exp - job_display_name = hyperparams.tt_name.split('_')[0] - job_display_name = job_display_name[0:3] - - # run hopt - print('submitting jobs...') - cluster.optimize_parallel_cluster_gpu( - main, - nb_trials=hyperparams.nb_hopt_trials, - job_name=job_display_name - ) - - -if __name__ == '__main__': - - model_name = get_model_name(sys.argv) - - # use default args - root_dir = os.path.split(os.path.dirname(sys.modules['__main__'].__file__))[0] - parent_parser = get_default_parser(strategy='random_search', root_dir=root_dir) - - # allow model to overwrite or extend args - TRAINING_MODEL = AVAILABLE_MODELS[model_name] - parser = TRAINING_MODEL.add_model_specific_args(parent_parser) - parser.json_config('-c', '--config', default=root_dir + '/run_configs/local.json') - hyperparams = parser.parse_args() - - # format GPU layout - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - gpu_ids = hyperparams.gpus.split(';') - - # RUN TRAINING - if hyperparams.on_cluster: - print('RUNNING ON SLURM CLUSTER') - os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(gpu_ids) - optimize_on_cluster(hyperparams) - - elif hyperparams.single_run_gpu: - print(f'RUNNING 1 TRIAL ON GPU. gpu: {gpu_ids[0]}') - os.environ["CUDA_VISIBLE_DEVICES"] = gpu_ids[0] - main(hyperparams, None, None) - - elif hyperparams.local or hyperparams.single_run: - os.environ["CUDA_VISIBLE_DEVICES"] = '0' - print('RUNNING LOCALLY') - main(hyperparams, None, None) - - else: - print(f'RUNNING MULTI GPU. GPU ids: {gpu_ids}') - hyperparams.optimize_parallel_gpu( - main_local, - gpu_ids=gpu_ids, - nb_trials=hyperparams.nb_hopt_trials, - nb_workers=len(gpu_ids) - ) diff --git a/research_lib/utils/__init__.py b/research_lib/utils/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/research_lib/utils/arg_parse.py b/research_lib/utils/arg_parse.py deleted file mode 100644 index a3302b0009..0000000000 --- a/research_lib/utils/arg_parse.py +++ /dev/null @@ -1,67 +0,0 @@ -def add_default_args(parser, root_dir, possible_model_names, rand_seed): - - # tng, test, val check intervals - parser.add_argument('--eval_test_set', dest='eval_test_set', action='store_true', help='true = run test set also') - parser.add_argument('--check_val_every_n_epoch', default=1, type=int, help='check val every n epochs') - parser.opt_list('--accumulate_grad_batches', default=1, type=int, tunable=False, - help='accumulates gradients k times before applying update. Simulates huge batch size') - parser.add_argument('--max_nb_epochs', default=200, type=int, help='cap epochs') - parser.add_argument('--min_nb_epochs', default=2, type=int, help='min epochs') - parser.add_argument('--train_percent_check', default=1.0, type=float, help='how much of tng set to check') - parser.add_argument('--val_percent_check', default=1.0, type=float, help='how much of val set to check') - parser.add_argument('--test_percent_check', default=1.0, type=float, help='how much of test set to check') - - parser.add_argument('--val_check_interval', default=0.95, type=float, help='how much within 1 epoch to check val') - parser.add_argument('--log_save_interval', default=100, type=int, help='how many batches between log saves') - parser.add_argument('--add_log_row_interval', default=100, type=int, help='add log every k batches') - - # early stopping - parser.add_argument('--disable_early_stop', dest='enable_early_stop', action='store_false') - parser.add_argument('--early_stop_metric', default='val_acc', type=str) - parser.add_argument('--early_stop_mode', default='min', type=str) - parser.add_argument('--early_stop_patience', default=3, type=int, help='number of epochs until stop') - - # gradient handling - parser.add_argument('--gradient_clip', default=-1, type=int) - parser.add_argument('--track_grad_norm', default=-1, type=int, help='if > 0, will track this grad norm') - - # model saving - parser.add_argument('--model_save_path', default=root_dir + '/model_weights') - parser.add_argument('--model_save_monitor_value', default='val_acc') - parser.add_argument('--model_save_monitor_mode', default='max') - - # model paths - parser.add_argument('--model_load_weights_path', default=None, type=str) - parser.add_argument('--model_name', default='', help=','.join(possible_model_names)) - - # test_tube settings - parser.add_argument('-en', '--tt_name', default='r_lib_') - parser.add_argument('-td', '--tt_description', default='test research lib') - parser.add_argument('--tt_save_path', default=root_dir + '/test_tube_logs', help='logging dir') - parser.add_argument('--enable_single_run', dest='single_run', action='store_true') - parser.add_argument('--nb_hopt_trials', default=1, type=int) - parser.add_argument('--log_stdout', dest='log_stdout', action='store_true') - - # GPU - parser.add_argument('--per_experiment_nb_gpus', default=1, type=int) - parser.add_argument('--gpus', default='0', type=str) - parser.add_argument('--single_run_gpu', dest='single_run_gpu', action='store_true') - parser.add_argument('--disable_cuda', dest='disable_cuda', action='store_true') - - # run on hpc - parser.add_argument('--on_cluster', dest='on_cluster', action='store_true') - - # FAST training - # use these settings to make sure network has no bugs without running a full dataset - parser.add_argument('--fast_dev_run', dest='fast_dev_run', default=False, action='store_true', help='runs validation after 1 tng step') - parser.add_argument('--enable_tqdm', dest='enable_tqdm', default=False, action='store_true', help='false removes the prog bar') - parser.add_argument('--overfit', default=-1, type=float, help='% of dataset to use with this option. float, or -1 for none') - - # debug args - parser.add_argument('--random_seed', default=rand_seed, type=int) - parser.add_argument('--live', dest='live', action='store_true', help='runs on gpu without cluster') - parser.add_argument('--enable_debug', dest='debug', action='store_true', help='enables/disables test tube') - parser.add_argument('--enable_local', dest='local', action='store_true', help='enables local tng') - - # optimizer - parser.add_argument('--lr_scheduler_milestones', default=None, type=str) \ No newline at end of file diff --git a/research_lib/utils/embeddings.py b/research_lib/utils/embeddings.py deleted file mode 100644 index 5f064fcb98..0000000000 --- a/research_lib/utils/embeddings.py +++ /dev/null @@ -1,107 +0,0 @@ -import torch -import numpy as np -from copy import deepcopy - - -class PretrainedEmbedding(torch.nn.Embedding): - - def __init__(self, embedding_path, embedding_dim, task_vocab, freeze=True, *args, **kwargs): - """ - Loads a prebuilt pytorch embedding from any embedding formated file. - Padding=0 by default. - - >>> emb = PretrainedEmbedding(embedding_path='glove.840B.300d.txt',embedding_dim=300, task_vocab={'hello': 1, 'world': 2}) - >>> data = torch.Tensor([[0, 1], [0, 2]]).long() - >>> embedded = emb(data) - tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000], - [ 0.2523, 0.1018, -0.6748, ..., 0.1787, -0.5192, 0.3359]], - - [[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000], - [-0.0067, 0.2224, 0.2771, ..., 0.0594, 0.0014, 0.0987]]]) - - - :param embedding_path: - :param emb_dim: - :param task_vocab: - :param freeze: - :return: - """ - # count the vocab - self.vocab_size = max(task_vocab.values()) + 1 - super(PretrainedEmbedding, self).__init__(self.vocab_size, embedding_dim, padding_idx=0, *args, **kwargs) - - # load pretrained embeddings - new_emb = self.__load_task_specific_embeddings(deepcopy(task_vocab), embedding_path, embedding_dim, freeze) - - # transfer weights - self.weight = new_emb.weight - - # apply freeze - self.weight.requires_grad = not freeze - - def __load_task_specific_embeddings(self, vocab_words, embedding_path, emb_dim, freeze): - """ - Iterates embedding file to only pull out task specific embeddings - :param vocab_words: - :param embedding_path: - :param emb_dim: - :param freeze: - :return: - """ - - # holds final embeddings for relevant words - embeddings = np.zeros(shape=(self.vocab_size, emb_dim)) - - # load embedding line by line and extract relevant embeddings - with open(embedding_path, encoding='utf-8') as f: - for line in f: - tokens = line.split(' ') - word = tokens[0] - embedding = tokens[1:] - embedding[-1] = embedding[-1][:-1] # remove last new line - - if word in vocab_words: - vocab_word_i = vocab_words[word] - - # skip words that try to overwrite pad idx - if vocab_word_i == 0: - del vocab_words[word] - continue - - emb_vals = np.asarray([float(x) for x in embedding]) - embeddings[vocab_word_i] = emb_vals - - # remove vocab word to early terminate - del vocab_words[word] - - # early break - if len(vocab_words) == 0: - break - - # add random vectors for the non-pretrained words - # these are vocab words NOT found in the pretrained embeddings - for w, i in vocab_words.items(): - # skip words that try to overwrite pad idx - if i == 0: - continue - - embedding = np.random.normal(size=emb_dim) - embeddings[i] = embedding - - # turn into pt embedding - embeddings = torch.FloatTensor(embeddings) - embeddings = torch.nn.Embedding.from_pretrained(embeddings, freeze=freeze) - - return embeddings - - -if __name__ == '__main__': - emb = PretrainedEmbedding( - embedding_path='/Users/waf/Developer/NGV/research-fermat/fermat/.vector_cache/glove.840B.300d.txt', - embedding_dim=300, - task_vocab={'hello': 1, 'world': 2} - ) - - data = torch.Tensor([[0, 1], [0, 2]]).long() - embedded = emb(data) - print(embedded) \ No newline at end of file diff --git a/research_lib/utils/plotting.py b/research_lib/utils/plotting.py deleted file mode 100644 index 95fe64cc50..0000000000 --- a/research_lib/utils/plotting.py +++ /dev/null @@ -1,28 +0,0 @@ -from matplotlib import pyplot as plt -import numpy as np -np.seterr(divide='ignore', invalid='ignore') - - -def plot_confusion_matrix(cm, - save_path, - normalize=False, - title='Confusion matrix', - ylabel='y', - xlabel='x'): - """ - This function prints and plots the confusion matrix. - Normalization can be applied by setting `normalize=True`. - """ - if normalize: - cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] - print("Normalized confusion matrix") - else: - print('Confusion matrix, without normalization') - - fig = plt.figure() - plt.matshow(cm) - plt.title(title) - plt.colorbar() - plt.ylabel(ylabel) - plt.xlabel(xlabel) - plt.savefig(save_path) diff --git a/research_lib/utils/pt_callbacks.py b/research_lib/utils/pt_callbacks.py deleted file mode 100644 index 6c3888139e..0000000000 --- a/research_lib/utils/pt_callbacks.py +++ /dev/null @@ -1,261 +0,0 @@ -import numpy as np -import os, shutil - - -class Callback(object): - """Abstract base class used to build new callbacks. - # Properties - params: dict. Training parameters - (eg. verbosity, batch size, number of epochs...). - model: instance of `keras.models.Model`. - Reference of the model being trained. - The `logs` dictionary that callback methods - take as argument will contain keys for quantities relevant to - the current batch or epoch. - Currently, the `.fit()` method of the `Sequential` model class - will include the following quantities in the `logs` that - it passes to its callbacks: - on_epoch_end: logs include `acc` and `loss`, and - optionally include `val_loss` - (if validation is enabled in `fit`), and `val_acc` - (if validation and accuracy monitoring are enabled). - on_batch_begin: logs include `size`, - the number of samples in the current batch. - on_batch_end: logs include `loss`, and optionally `acc` - (if accuracy monitoring is enabled). - """ - - def __init__(self): - self.validation_data = None - self.model = None - - def set_params(self, params): - self.params = params - - def set_model(self, model): - self.model = model - - def on_epoch_begin(self, epoch, logs=None): - pass - - def on_epoch_end(self, epoch, logs=None): - pass - - def on_batch_begin(self, batch, logs=None): - pass - - def on_batch_end(self, batch, logs=None): - pass - - def on_train_begin(self, logs=None): - pass - - def on_train_end(self, logs=None): - pass - - -class EarlyStopping(Callback): - """Stop training when a monitored quantity has stopped improving. - # Arguments - monitor: quantity to be monitored. - min_delta: minimum change in the monitored quantity - to qualify as an improvement, i.e. an absolute - change of less than min_delta, will count as no - improvement. - patience: number of epochs with no improvement - after which training will be stopped. - verbose: verbosity mode. - mode: one of {auto, min, max}. In `min` mode, - training will stop when the quantity - monitored has stopped decreasing; in `max` - mode it will stop when the quantity - monitored has stopped increasing; in `auto` - mode, the direction is automatically inferred - from the name of the monitored quantity. - """ - - def __init__(self, monitor='val_loss', - min_delta=0.0, patience=0, verbose=0, mode='auto'): - super(EarlyStopping, self).__init__() - - self.monitor = monitor - self.patience = patience - self.verbose = verbose - self.min_delta = min_delta - self.wait = 0 - self.stopped_epoch = 0 - - if mode not in ['auto', 'min', 'max']: - print('EarlyStopping mode %s is unknown, fallback to auto mode.' % mode) - mode = 'auto' - - if mode == 'min': - self.monitor_op = np.less - elif mode == 'max': - self.monitor_op = np.greater - else: - if 'acc' in self.monitor: - self.monitor_op = np.greater - else: - self.monitor_op = np.less - - if self.monitor_op == np.greater: - self.min_delta *= 1 - else: - self.min_delta *= -1 - - self.on_train_begin() - - def on_train_begin(self, logs=None): - # Allow instances to be re-used - self.wait = 0 - self.stopped_epoch = 0 - self.best = np.Inf if self.monitor_op == np.less else -np.Inf - - def on_epoch_end(self, epoch, logs=None): - current = logs.get(self.monitor) - stop_training = False - if current is None: - print('Early stopping conditioned on metric `%s` ''which is not available. Available metrics are: %s' % - (self.monitor, ','.join(list(logs.keys()))), RuntimeWarning - ) - exit(-1) - - if self.monitor_op(current - self.min_delta, self.best): - self.best = current - self.wait = 0 - else: - self.wait += 1 - if self.wait >= self.patience: - self.stopped_epoch = epoch - stop_training = True - self.on_train_end() - - return stop_training - - def on_train_end(self, logs=None): - if self.stopped_epoch > 0 and self.verbose > 0: - print('Epoch %05d: early stopping' % (self.stopped_epoch + 1)) - - -class ModelCheckpoint(Callback): - """Save the model after every epoch. - `filepath` can contain named formatting options, - which will be filled the value of `epoch` and - keys in `logs` (passed in `on_epoch_end`). - For example: if `filepath` is `weights.{epoch:02d}-{val_loss:.2f}.hdf5`, - then the model checkpoints will be saved with the epoch number and - the validation loss in the filename. - # Arguments - filepath: string, path to save the model file. - monitor: quantity to monitor. - verbose: verbosity mode, 0 or 1. - save_best_only: if `save_best_only=True`, - the latest best model according to - the quantity monitored will not be overwritten. - mode: one of {auto, min, max}. - If `save_best_only=True`, the decision - to overwrite the current save file is made - based on either the maximization or the - minimization of the monitored quantity. For `val_acc`, - this should be `max`, for `val_loss` this should - be `min`, etc. In `auto` mode, the direction is - automatically inferred from the name of the monitored quantity. - save_weights_only: if True, then only the model's weights will be - saved (`model.save_weights(filepath)`), else the full model - is saved (`model.save(filepath)`). - period: Interval (number of epochs) between checkpoints. - """ - - def __init__(self, filepath, save_function, monitor='val_loss', verbose=0, - save_best_only=False, save_weights_only=False, - mode='auto', period=1, prefix=''): - super(ModelCheckpoint, self).__init__() - self.monitor = monitor - self.save_function = save_function - self.verbose = verbose - self.filepath = filepath - self.save_best_only = save_best_only - self.save_weights_only = save_weights_only - self.period = period - self.epochs_since_last_save = 0 - self.prefix = prefix - - if mode not in ['auto', 'min', 'max']: - print('ModelCheckpoint mode %s is unknown, ' - 'fallback to auto mode.' % (mode), - RuntimeWarning) - mode = 'auto' - - if mode == 'min': - self.monitor_op = np.less - self.best = np.Inf - elif mode == 'max': - self.monitor_op = np.greater - self.best = -np.Inf - else: - if 'acc' in self.monitor or self.monitor.startswith('fmeasure'): - self.monitor_op = np.greater - self.best = -np.Inf - else: - self.monitor_op = np.less - self.best = np.Inf - - def save_model(self, filepath, overwrite): - dirpath = '/'.join(filepath.split('/')[:-1]) - - # make paths - os.makedirs(os.path.dirname(filepath), exist_ok=True) - - if overwrite: - for filename in os.listdir(dirpath): - if self.prefix in filename: - path_to_delete = os.path.join(dirpath, filename) - try: - shutil.rmtree(path_to_delete) - except OSError: - os.remove(path_to_delete) - - # delegate the saving to the model - self.save_function(filepath) - - def on_epoch_end(self, epoch, logs=None): - logs = logs or {} - self.epochs_since_last_save += 1 - if self.epochs_since_last_save >= self.period: - self.epochs_since_last_save = 0 - filepath = '{}/{}_ckpt_epoch_{}.ckpt'.format(self.filepath, self.prefix, epoch + 1) - if self.save_best_only: - current = logs.get(self.monitor) - if current is None: - print('Can save best model only with %s available, ' - 'skipping.' % (self.monitor), RuntimeWarning) - else: - if self.monitor_op(current, self.best): - if self.verbose > 0: - print('\nEpoch %05d: %s improved from %0.5f to %0.5f,' - ' saving model to %s' - % (epoch + 1, self.monitor, self.best, - current, filepath)) - self.best = current - self.save_model(filepath, overwrite=True) - - else: - if self.verbose > 0: - print('\nEpoch %05d: %s did not improve' % - (epoch + 1, self.monitor)) - else: - if self.verbose > 0: - print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath)) - self.save_model(filepath, overwrite=False) - - -if __name__ == '__main__': - c = EarlyStopping(min_delta=0.9, patience=2, verbose=True) - losses = [10, 9, 8, 8, 6, 4.3, 5, 4.4, 2.8, 2.5] - for i, loss in enumerate(losses): - should_stop = c.on_epoch_end(i, logs={'val_loss': loss}) - print(loss) - if should_stop: - break -