# 2018-06-20 06:22:34 +00:00
from text import torchtext
import time
import os
import sys
import torch
import random
import numpy as np
# 2018-11-07 23:06:41 +00:00
from text.torchtext.data.utils import get_tokenizer
def tokenizer(s):
    """Split ``s`` on runs of whitespace and return the list of tokens."""
    tokens = s.split()
    return tokens
# 2018-06-20 06:22:34 +00:00
def get_context_question(ex, context, question, field):
    """Concatenate an example's context and question with their special prefixes.

    The result is ``ex.context_special + ex.context + ex.question_special +
    ex.question``. Only ``ex`` is read; ``context``, ``question`` and ``field``
    are accepted for interface compatibility and are not used.
    """
    combined = ex.context_special + ex.context
    combined = combined + ex.question_special
    combined = combined + ex.question
    return combined
def preprocess_examples(args, tasks, splits, field, logger=None, train=True):
    """Truncate, filter and annotate the examples of every split, in place.

    For each ``(task, split)`` pair this:

    * truncates ``context`` to the maximum context length for tasks whose name
      contains ``cnn``, ``dailymail`` or ``imdb``;
    * when ``train`` is true, drops examples that are too long, too short, or
      whose answer contains the dummy-summary marker text;
    * sets ``context_question`` on every remaining example;
    * when ``logger`` is given, logs counts, length statistics and the first
      ten examples.

    Args:
        args: object providing ``max_train_context_length``,
            ``max_val_context_length`` and ``max_answer_length``.
        tasks: iterable of task names, paired positionally with ``splits``.
        splits: iterable of objects with a mutable ``examples`` list.
        field: passed through to ``get_context_question``.
        logger: optional object with an ``info(str)`` method.
        train: selects the training context limit and enables filtering.

    Returns:
        None. ``splits`` and their examples are modified in place.
    """
    min_length = 1
    max_context_length = args.max_train_context_length if train else args.max_val_context_length

    # NOTE(review): in the reviewed text both predicates were cut off after
    # `or`. The context-length operands below are reconstructed from the
    # otherwise unused limits in scope -- confirm against version control.
    def is_too_long(ex):
        return (len(ex.answer) > args.max_answer_length or
                len(ex.context) > max_context_length)

    def is_too_short(ex):
        return (len(ex.answer) < min_length or
                len(ex.context) < min_length)

    def has_dummy_summary(ex):
        return 'This page includes the show' in ex.answer

    def drop_matching(split, predicate):
        """Remove examples matching ``predicate``; return (before, after) counts."""
        before = len(split.examples)
        split.examples = [ex for ex in split.examples if not predicate(ex)]
        return before, len(split.examples)

    for task, s in zip(tasks, splits):
        if logger is not None:
            logger.info(f'{task} has {len(s.examples)} examples')

        if 'cnn' in task or 'dailymail' in task or 'imdb' in task:
            for x in s.examples:
                x.context = x.context[:max_context_length]

        if train:
            before, after = drop_matching(s, is_too_long)
            if after < before and logger is not None:
                logger.info(f'Filtering out long {task} examples: {before} -> {after}')

            before, after = drop_matching(s, is_too_short)
            if after < before and logger is not None:
                logger.info(f'Filtering out short {task} examples: {before} -> {after}')

            before, after = drop_matching(s, has_dummy_summary)
            if after < before and logger is not None:
                logger.info(f'Filtering {task} examples with a dummy summary: {before} -> {after} ')

        if logger is not None:
            if s.examples:
                stats = (
                    ('context', [len(ex.context) for ex in s.examples]),
                    ('question', [len(ex.question) for ex in s.examples]),
                    ('answer', [len(ex.answer) for ex in s.examples]),
                )
                for label, lengths in stats:
                    logger.info(f'{task} {label} lengths (min, mean, max): {np.min(lengths)}, {int(np.mean(lengths))}, {np.max(lengths)}')
            else:
                # np.min / np.max raise ValueError on an empty sequence, so
                # skip the statistics when nothing survived filtering.
                logger.info(f'{task} has no examples left; skipping length statistics')

        for x in s.examples:
            x.context_question = get_context_question(x, x.context, x.question, field)

        if logger is not None:
            logger.info('Tokenized examples:')
            for ex in s.examples[:10]:
                logger.info('Context: ' + ' '.join(ex.context))
                logger.info('Question: ' + ' '.join(ex.question))
                logger.info(' '.join(ex.context_question))
                logger.info('Answer: ' + ' '.join(ex.answer))
def set_seed(args, rank=None):
    """Select the compute device for this process and seed the random generators.

    Args:
        args: object providing ``devices`` (a sequence of device ordinals,
            where a negative ordinal means CPU) and ``seed``.
        rank: optional index into ``args.devices``. When ``None`` the first
            listed device is used.

    Returns:
        The selected ``torch.device``.
    """
    if rank is not None:
        ordinal = args.devices[rank]
    elif len(args.devices) > 0:
        ordinal = args.devices[0]
    else:
        # No device listed and no rank given: fall back to CPU instead of
        # failing on ``args.devices[None]``.
        ordinal = -1

    device = torch.device(f'cuda:{ordinal}' if ordinal > -1 else 'cpu')
    print(f'device: {device}')

    # NOTE(review): the seeding statements were missing from the reviewed text
    # (the CUDA context manager had no body). They are reconstructed on the
    # assumption that the seed lives in ``args.seed`` -- confirm against
    # version control.
    np.random.seed(args.seed)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if ordinal > -1:
        with torch.cuda.device(ordinal):
            torch.cuda.manual_seed(args.seed)
    return device
# 2018-06-20 06:22:34 +00:00
def count_params(params):
    """Return the total number of scalar elements across ``params``.

    Each item must expose ``size()`` returning an iterable of dimension
    extents. An item's element count is the product of those extents (1 when
    ``size()`` is empty), and the result is the sum over all items.
    """
    def element_count(param):
        product = 1
        for extent in param.size():
            product *= extent
        return product

    return sum(element_count(param) for param in params)
def get_trainable_params(model):
    """Return a list of the entries of ``model.parameters()`` whose ``requires_grad`` is truthy."""
    return [param for param in model.parameters() if param.requires_grad]
def elapsed_time(log):
    """Format the wall-clock time elapsed since ``log.start`` as ``DD:HH:MM:SS``.

    ``log.start`` is compared against ``time.time()``. Each field is
    zero-padded to at least two digits and fractional seconds are truncated.
    """
    remaining = time.time() - log.start
    days, remaining = divmod(remaining, 24 * 3600)
    hours, remaining = divmod(remaining, 3600)
    minutes, remaining = divmod(remaining, 60)
    return f'{int(days):02}:{int(hours):02}:{int(minutes):02}:{int(remaining):02}'
def get_splits(args, task, FIELD, **kwargs):
    """Load the dataset splits for ``task``.

    The task name is matched against the known dataset names in a fixed order
    (mostly by substring; ``cnn``, ``dailymail`` and ``cnn_dailymail`` by
    equality). If nothing matches and ``args.data/<task>`` exists on disk, the
    task is loaded as a generic JSON dataset.

    Args:
        args: object providing ``data`` (dataset root directory),
            ``skip_cache_bool`` and, for Almond tasks, ``reverse_task_bool``.
        task: task name. Translation-style tasks encode the language pair as
            ``name.src.trg``.
        FIELD: the field object passed as ``fields=`` to every dataset. It is
            temporarily reconfigured while Almond data is loaded.
        **kwargs: forwarded to the dataset's ``splits`` constructor.

    Returns:
        Whatever the selected dataset's ``splits(...)`` call returns.

    Raises:
        ValueError: if ``task`` matches no known dataset and no directory
            named ``task`` exists under ``args.data``.
    """
    kwargs['skip_cache_bool'] = args.skip_cache_bool
    generic = None  # bound lazily below so unknown tasks never touch torchtext

    if 'multi30k' in task:
        generic = torchtext.datasets.generic
        src, trg = ['.' + x for x in task.split('.')[1:]]
        split = generic.Multi30k.splits(
            exts=(src, trg), fields=FIELD, root=args.data, **kwargs)
    elif 'iwslt' in task:
        generic = torchtext.datasets.generic
        src, trg = ['.' + x for x in task.split('.')[1:]]
        split = generic.IWSLT.splits(
            exts=(src, trg), fields=FIELD, root=args.data, **kwargs)
    elif 'almond' in task:
        generic = torchtext.datasets.generic
        # Almond is loaded with the plain whitespace tokenizer; the field is
        # switched back to revtok afterwards.
        FIELD.use_revtok = False
        FIELD.tokenize = tokenizer
        # NOTE(review): the `else:` was missing from the reviewed text, which
        # made the reverse-task assignment dead code -- restored here.
        if args.reverse_task_bool:
            src, trg = '.tt', '.en'  # for the reverse task
        else:
            src, trg = '.en', '.tt'
        try:
            split = generic.Almond.splits(
                exts=(src, trg), fields=FIELD, root=args.data, **kwargs)
        finally:
            # Restore even if loading fails so FIELD is not left in the
            # whitespace-tokenizing state for later tasks.
            FIELD.use_revtok = True
            FIELD.tokenize = get_tokenizer('revtok')
    elif 'squad' in task:
        generic = torchtext.datasets.generic
        split = generic.SQuAD.splits(
            fields=FIELD, root=args.data, description=task, **kwargs)
    elif 'wikisql' in task:
        generic = torchtext.datasets.generic
        split = generic.WikiSQL.splits(
            fields=FIELD, root=args.data,
            query_as_question='query_as_question' in task, **kwargs)
    elif 'ontonotes.ner' in task:
        generic = torchtext.datasets.generic
        # The task name must have exactly five dot-separated parts; the last
        # one is not used here.
        _, _, subtask, nones, _counting = task.split('.')
        split = generic.OntoNotesNER.splits(
            subtask=subtask, nones=(nones == 'nones'),
            fields=FIELD, root=args.data, **kwargs)
    elif 'woz' in task:
        generic = torchtext.datasets.generic
        split = generic.WOZ.splits(
            description=task, fields=FIELD, root=args.data, **kwargs)
    elif 'multinli' in task:
        generic = torchtext.datasets.generic
        split = generic.MultiNLI.splits(
            description=task, fields=FIELD, root=args.data, **kwargs)
    elif 'srl' in task:
        generic = torchtext.datasets.generic
        split = generic.SRL.splits(fields=FIELD, root=args.data, **kwargs)
    elif 'snli' in task:
        generic = torchtext.datasets.generic
        split = generic.SNLI.splits(fields=FIELD, root=args.data, **kwargs)
    elif 'schema' in task:
        generic = torchtext.datasets.generic
        split = generic.WinogradSchema.splits(fields=FIELD, root=args.data, **kwargs)
    elif task == 'cnn':
        generic = torchtext.datasets.generic
        split = generic.CNN.splits(fields=FIELD, root=args.data, **kwargs)
    elif task == 'dailymail':
        generic = torchtext.datasets.generic
        split = generic.DailyMail.splits(fields=FIELD, root=args.data, **kwargs)
    elif task == 'cnn_dailymail':
        generic = torchtext.datasets.generic
        split_cnn = generic.CNN.splits(fields=FIELD, root=args.data, **kwargs)
        split_dm = generic.DailyMail.splits(fields=FIELD, root=args.data, **kwargs)
        # NOTE(review): the loop body was missing from the reviewed text.
        # Merging the DailyMail examples into the matching CNN split is the
        # presumed intent -- confirm against version control.
        for scnn, sdm in zip(split_cnn, split_dm):
            scnn.examples.extend(sdm.examples)
        split = split_cnn
    elif 'sst' in task:
        generic = torchtext.datasets.generic
        split = generic.SST.splits(fields=FIELD, root=args.data, **kwargs)
    elif 'imdb' in task:
        generic = torchtext.datasets.generic
        kwargs['validation'] = None
        split = generic.IMDb.splits(fields=FIELD, root=args.data, **kwargs)
    elif 'zre' in task:
        generic = torchtext.datasets.generic
        split = generic.ZeroShotRE.splits(fields=FIELD, root=args.data, **kwargs)
    elif os.path.exists(os.path.join(args.data, task)):
        generic = torchtext.datasets.generic
        split = generic.JSON.splits(
            fields=FIELD, root=args.data, name=task, **kwargs)
    else:
        # Previously this fell through to `return split` with the name
        # unbound, surfacing as an UnboundLocalError.
        raise ValueError(
            f'Unknown task {task!r}: not a built-in dataset and '
            f'{os.path.join(args.data, task)!r} does not exist')
    return split
def batch_fn(new, i, sofar):
    """Return the size of a batch of ``i`` examples after adding ``new``.

    The per-example width is the largest of ``len(new.context)``,
    ``5 * len(new.answer)`` and the previous width ``sofar / (i - 1)`` (taken
    as 0 when ``i <= 1``). The result is that width multiplied by ``i``.
    Presumably used as a batch-size function for a bucketing iterator --
    verify against the caller.
    """
    if i > 1:
        previous_width = sofar / (i - 1)
    else:
        previous_width = 0
    width = max(len(new.context), 5 * len(new.answer), previous_width)
    return width * i
def pad(x, new_channel, dim, val=None):
    """Force dimension ``dim`` of tensor ``x`` to have exactly ``new_channel`` entries.

    Args:
        x: input tensor.
        new_channel: target size of dimension ``dim``.
        dim: dimension to truncate or pad.
        val: fill value for the padding. Required only when ``x`` is shorter
            than ``new_channel`` along ``dim``.

    Returns:
        ``x`` itself when the size already matches, a narrowed view of ``x``
        when it is too long, or a new tensor with ``val`` appended along
        ``dim`` when it is too short. Padding has the dtype and device of ``x``.

    Raises:
        TypeError: if padding is needed but ``val`` is ``None``.
    """
    current = x.size(dim)
    if current > new_channel:
        # Too long: keep only the first ``new_channel`` entries along ``dim``.
        return x.narrow(dim, 0, new_channel)
    if current == new_channel:
        return x
    if val is None:
        # Filling with None already failed with a TypeError; keep the
        # exception type but say what is missing.
        raise TypeError(
            f'pad() needs a fill value `val` to grow dimension {dim} '
            f'from {current} to {new_channel}')
    size = list(x.size())
    size[dim] = new_channel - current
    padding = x.new_full(size, val)
    return torch.cat([x, padding], dim)