diff --git a/docs/Trainer/debugging.md b/docs/Trainer/debugging.md
index 296938ba3e..bc63930b4c 100644
--- a/docs/Trainer/debugging.md
+++ b/docs/Trainer/debugging.md
@@ -38,6 +38,14 @@ trainer = Trainer(overfit_pct=0.01)
 #### Print the parameter count by layer
 By default lightning prints a list of parameters *and submodules* when it starts training.
 
+``` {.python}
+# DEFAULT print a full list of all submodules and their parameters.
+trainer = Trainer(weights_summary='full')
+
+# only print the top-level modules (i.e. the children of LightningModule).
+trainer = Trainer(weights_summary='top')
+```
+
 ---
 #### Print which gradients are nan
 This option prints a list of tensors with nan gradients.
diff --git a/pytorch_lightning/root_module/memory.py b/pytorch_lightning/root_module/memory.py
index ec4e2c0297..a4e94e3f00 100644
--- a/pytorch_lightning/root_module/memory.py
+++ b/pytorch_lightning/root_module/memory.py
@@ -12,11 +12,12 @@ import pandas as pd
 
 class ModelSummary(object):
 
-    def __init__(self, model):
+    def __init__(self, model, mode='full'):
         '''
         Generates summaries of model layers and dimensions.
         '''
         self.model = model
+        self.mode = mode
         self.in_sizes = []
         self.out_sizes = []
 
@@ -28,9 +29,20 @@ class ModelSummary(object):
     def __repr__(self):
         return self.summary.__str__()
 
+    def named_modules(self):
+        if self.mode == 'full':
+            mods = self.model.named_modules()
+            mods = list(mods)[1:]  # do not include root module (LightningModule)
+        elif self.mode == 'top':
+            # the children are the top-level modules
+            mods = self.model.named_children()
+        else:
+            mods = []
+        return list(mods)
+
     def get_variable_sizes(self):
         '''Run sample input through each layer to get output sizes'''
-        mods = list(self.model.modules())
+        mods = self.named_modules()
         in_sizes = []
         out_sizes = []
         input_ = self.model.example_input_array
@@ -43,8 +55,7 @@ class ModelSummary(object):
 
         with torch.no_grad():
 
-            for i in range(1, len(mods)):
-                m = mods[i]
+            for _, m in mods:
                 if type(input_) is list or type(input_) is tuple:  # pragma: no cover
                     out = m(*input_)
                 else:
@@ -72,16 +83,17 @@ class ModelSummary(object):
 
         self.in_sizes = in_sizes
         self.out_sizes = out_sizes
+        assert len(in_sizes) == len(out_sizes)
         return
 
     def get_layer_names(self):
         '''Collect Layer Names'''
-        mods = list(self.model.named_modules())
+        mods = self.named_modules()
         names = []
         layers = []
-        for m in mods[1:]:
-            names += [m[0]]
-            layers += [str(m[1].__class__)]
+        for name, m in mods:
+            names += [name]
+            layers += [str(m.__class__)]
 
         layer_types = [x.split('.')[-1][:-2] for x in layers]
 
@@ -91,11 +103,9 @@ class ModelSummary(object):
 
     def get_parameter_sizes(self):
         '''Get sizes of all parameters in `model`'''
-        mods = list(self.model.modules())
+        mods = self.named_modules()
         sizes = []
-
-        for i in range(1, len(mods)):
-            m = mods[i]
+        for _, m in mods:
             p = list(m.parameters())
             modsz = []
             for j in range(len(p)):
@@ -133,6 +143,7 @@ class ModelSummary(object):
 
         df['Name'] = self.layer_names
         df['Type'] = self.layer_types
         df['Params'] = self.param_nums
+        df['Params'] = df['Params'].map(get_human_readable_count)
 
         if self.model.example_input_array is not None:
@@ -226,3 +237,28 @@ def get_gpu_memory_map():
         k = f'gpu_{k}'
         gpu_memory_map[k] = v
     return gpu_memory_map
+
+
+def get_human_readable_count(number):
+    """
+    Abbreviates an integer number with K, M, B, T for thousands, millions,
+    billions and trillions, respectively.
+    Examples:
+        123   -> 123
+        1234  -> 1 K     (one thousand)
+        2e6   -> 2 M     (two million)
+        3e9   -> 3 B     (three billion)
+        4e12  -> 4 T     (four trillion)
+        5e15  -> 5,000 T
+    :param number: a positive integer number
+    :returns a string formatted according to the pattern described above.
+    """
+    assert number >= 0
+    labels = [' ', 'K', 'M', 'B', 'T']
+    num_digits = int(np.floor(np.log10(number)) + 1 if number > 0 else 1)
+    num_groups = int(np.ceil(num_digits / 3))
+    num_groups = min(num_groups, len(labels))  # don't abbreviate beyond trillions
+    shift = -3 * (num_groups - 1)
+    number = number * (10 ** shift)
+    index = num_groups - 1
+    return f'{int(number):,d} {labels[index]}'
diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py
index c4e836bcd5..aa7ddb834e 100644
--- a/pytorch_lightning/root_module/root_module.py
+++ b/pytorch_lightning/root_module/root_module.py
@@ -159,8 +159,8 @@ class LightningModule(GradInformation, ModelIO, ModelHooks):
 
         return model
 
-    def summarize(self):
-        model_summary = ModelSummary(self)
+    def summarize(self, mode):
+        model_summary = ModelSummary(self, mode=mode)
         print(model_summary)
 
     def freeze(self):
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index c9fb507701..7a7c8eed7c 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -84,7 +84,7 @@ class Trainer(TrainerIO):
                  distributed_backend=None,
                  use_amp=False,
                  print_nan_grads=False,
-                 print_weights_summary=True,
+                 weights_summary='full',
                  weights_save_path=None,
                  amp_level='O1',
                  nb_sanity_val_steps=5):
@@ -116,7 +116,7 @@ class Trainer(TrainerIO):
         :param distributed_backend: str. Options: 'dp', 'ddp', 'ddp2'.
         :param use_amp: Bool. If true uses apex for 16bit precision
         :param print_nan_grads: Bool. Prints nan gradients
-        :param print_weights_summary: Bool. Prints summary of weights
+        :param weights_summary: str. Options: 'full', 'top'.
         :param weights_save_path: Bool. Where to save weights if on cluster
         :param amp_level: str. Check nvidia docs for level
         :param nb_sanity_val_steps: int. How many val steps before a full train loop.
@@ -131,7 +131,7 @@ class Trainer(TrainerIO):
         self.fast_dev_run = fast_dev_run
         self.on_gpu = gpus is not None and torch.cuda.is_available()
         self.process_position = process_position
-        self.print_weights_summary = print_weights_summary
+        self.weights_summary = weights_summary
         self.max_nb_epochs = max_nb_epochs
         self.min_nb_epochs = min_nb_epochs
         self.nb_sanity_val_steps = nb_sanity_val_steps
@@ -981,8 +981,8 @@ class Trainer(TrainerIO):
         self.__layout_bookeeping()
 
         # print model summary
-        if self.proc_rank == 0 and self.print_weights_summary:
-            ref_model.summarize()
+        if self.proc_rank == 0 and self.weights_summary in ['full', 'top']:
+            ref_model.summarize(mode=self.weights_summary)
 
         # link up experiment object
        if self.logger is not None:
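
A minimal usage sketch of the changes above, assuming the package's usual top-level `Trainer` import and a hypothetical user-defined `LightningModule` subclass called `CoolModel` (not part of this diff); it shows the new `weights_summary` option and the abbreviation done by `get_human_readable_count`:

``` {.python}
from pytorch_lightning import Trainer
from pytorch_lightning.root_module.memory import get_human_readable_count

# 'full' (the default) lists every submodule; 'top' lists only the direct
# children of the LightningModule; any other value skips the summary printout.
trainer = Trainer(weights_summary='top')
# trainer.fit(CoolModel())  # CoolModel is a hypothetical LightningModule subclass

# parameter counts in the summary table are abbreviated with K/M/B/T,
# matching the docstring examples above:
print(get_human_readable_count(1234))  # -> '1 K'
print(get_human_readable_count(2e6))   # -> '2 M'
```

Note that the string-valued `weights_summary` replaces the old boolean `print_weights_summary` flag, so values other than `'full'` or `'top'` disable the summary entirely.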