# coding: utf8
from copy import deepcopy
from collections import Counter, OrderedDict

import six
import torch
from tqdm import tqdm

from .dataset import Dataset
from .pipeline import Pipeline
from .utils import get_tokenizer
from ..vocab import Vocab, SubwordVocab


class RawField(object):
    """ Defines a general datatype.

    Every dataset consists of one or more types of data. For instance, a text
    classification dataset contains sentences and their classes, while a
    machine translation dataset contains paired examples of text in two
    languages. Each of these types of data is represented by a RawField object.
    A RawField object does not assume any property of the data type and
    it holds parameters relating to how a datatype should be processed.

    Attributes:
        preprocessing: The Pipeline that will be applied to examples
            using this field before creating an example.
            Default: None.
        postprocessing: A Pipeline that will be applied to a list of examples
            using this field before assigning to a batch.
            Function signature: (batch(list)) -> object
            Default: None.
    """

    def __init__(self, preprocessing=None, postprocessing=None):
        self.preprocessing = preprocessing
        self.postprocessing = postprocessing

    def preprocess(self, x):
        """ Preprocess an example if the `preprocessing` Pipeline is provided. """
        if self.preprocessing is not None:
            return self.preprocessing(x)
        else:
            return x

    def process(self, batch, *args, **kwargs):
        """ Process a list of examples to create a batch.

        Postprocess the batch with the user-provided Pipeline.

        Args:
            batch (list(object)): A list of objects from a batch of examples.
        Returns:
            data (object): Processed object given the input and custom
                postprocessing Pipeline.
        """
        if self.postprocessing is not None:
            batch = self.postprocessing(batch)
        return batch
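

# Usage sketch for RawField (illustrative only, not part of the original
# source): any callables can be used as the preprocessing/postprocessing
# hooks, since they are simply applied to an example or to the whole batch.
#
#     label_field = RawField(
#         preprocessing=lambda x: x.strip(),
#         postprocessing=lambda batch: sorted(batch))
#     label_field.preprocess("  positive \n")   # -> "positive"
#     label_field.process(["b", "a"])           # -> ["a", "b"]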


class Field(RawField):
    """Defines a datatype together with instructions for converting to Tensor.

    Field class models common text processing datatypes that can be represented
    by tensors. It holds a Vocab object that defines the set of possible values
    for elements of the field and their corresponding numerical representations.
    The Field object also holds other parameters relating to how a datatype
    should be numericalized, such as a tokenization method and the kind of
    Tensor that should be produced.

    If a Field is shared between two columns in a dataset (e.g., question and
    answer in a QA dataset), then they will have a shared vocabulary.

    Attributes:
        sequential: Whether the datatype represents sequential data. If False,
            no tokenization is applied. Default: True.
        use_vocab: Whether to use a Vocab object. If False, the data in this
            field should already be numerical. Default: True.
        init_token: A token that will be prepended to every example using this
            field, or None for no initial token. Default: None.
        eos_token: A token that will be appended to every example using this
            field, or None for no end-of-sentence token. Default: None.
        fix_length: A fixed length that all examples using this field will be
            padded to, or None for flexible sequence lengths. Default: None.
        tensor_type: The torch.Tensor class that represents a batch of examples
            of this kind of data. Default: torch.LongTensor.
        preprocessing: The Pipeline that will be applied to examples
            using this field after tokenizing but before numericalizing. Many
            Datasets replace this attribute with a custom preprocessor.
            Default: None.
        postprocessing: A Pipeline that will be applied to examples using
            this field after numericalizing but before the numbers are turned
            into a Tensor. The pipeline function takes the batch as a list,
            the field's Vocab, and train (a bool).
            Default: None.
        lower: Whether to lowercase the text in this field. Default: False.
        tokenize: The function used to tokenize strings using this field into
            sequential examples. If "spacy", the SpaCy English tokenizer is
            used. Default: str.split.
        include_lengths: Whether to return a tuple of a padded minibatch and
            a list containing the lengths of each example, or just a padded
            minibatch. Default: False.
        batch_first: Whether to produce tensors with the batch dimension first.
            Default: False.
        pad_token: The string token used as padding. Default: "<pad>".
        unk_token: The string token used to represent OOV words. Default: "<unk>".
        pad_first: Whether to pad sequences at the beginning instead of the end.
            Default: False.
        numerical: Whether examples are already lists of numerical indices that
            should be padded and turned into tensors without a vocabulary.
            Default: False.
        decap: Currently unused. Default: False.
    """

    vocab_cls = Vocab
    # Dictionary mapping PyTorch tensor types to the appropriate Python
    # numeric type.
    tensor_types = {
        torch.FloatTensor: float,
        torch.cuda.FloatTensor: float,
        torch.DoubleTensor: float,
        torch.cuda.DoubleTensor: float,
        torch.HalfTensor: float,
        torch.cuda.HalfTensor: float,

        torch.ByteTensor: int,
        torch.cuda.ByteTensor: int,
        torch.CharTensor: int,
        torch.cuda.CharTensor: int,
        torch.ShortTensor: int,
        torch.cuda.ShortTensor: int,
        torch.IntTensor: int,
        torch.cuda.IntTensor: int,
        torch.LongTensor: int,
        torch.cuda.LongTensor: int,
    }

    def __init__(
            self, sequential=True, use_vocab=True, init_token=None,
            eos_token=None, fix_length=None, tensor_type=torch.LongTensor,
            preprocessing=None, postprocessing=None, lower=False,
            tokenize=(lambda s: s.split()), include_lengths=False,
            batch_first=False, pad_token="<pad>", unk_token="<unk>",
            pad_first=False, decap=False, numerical=False):
        self.sequential = sequential
        self.numerical = numerical
        self.use_vocab = use_vocab
        self.init_token = init_token
        self.eos_token = eos_token
        self.unk_token = unk_token
        self.fix_length = fix_length
        self.tensor_type = tensor_type
        self.preprocessing = preprocessing
        self.postprocessing = postprocessing
        self.lower = lower
        self.tokenize = get_tokenizer(tokenize)
        self.include_lengths = include_lengths
        self.batch_first = batch_first
        self.pad_token = pad_token if self.sequential else None
        self.pad_first = pad_first
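
    # Usage sketch (illustrative, not part of the original source): a typical
    # text field splits on whitespace, lowercases, and marks sentence
    # boundaries, while a label field skips tokenization and vocabulary.
    #
    #     TEXT = Field(sequential=True, lower=True,
    #                  init_token='<bos>', eos_token='<eos>',
    #                  include_lengths=True)
    #     LABEL = Field(sequential=False, use_vocab=False,
    #                   tensor_type=torch.LongTensor)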

    def preprocess(self, x, tokenize=None):
        """Load a single example using this field, tokenizing if necessary.

        If the input is a Python 2 `str`, it will be converted to Unicode
        first. If `sequential=True`, it will be tokenized. Then the input
        will be optionally lowercased and passed to the user-provided
        `preprocessing` Pipeline."""
        if (six.PY2 and isinstance(x, six.string_types) and not
                isinstance(x, six.text_type)):
            x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
        if self.sequential and isinstance(x, six.text_type):
            if tokenize is None:
                tokenize = self.tokenize
            x = tokenize(x.rstrip('\n'))
        if self.lower:
            x = Pipeline(six.text_type.lower)(x)
        if self.preprocessing is not None:
            return self.preprocessing(x)
        else:
            return x
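
    # Example of what ``preprocess`` produces (illustrative values only):
    #
    #     f = Field(lower=True, tokenize=lambda s: s.split())
    #     f.preprocess("The quick Fox\n")   # -> ['the', 'quick', 'fox']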

    def process(self, batch, device, train, **kwargs):
        """ Process a list of examples to create a torch.Tensor.

        Pad, numericalize, and postprocess a batch and create a tensor.

        Args:
            batch (list(object)): A list of objects from a batch of examples.
        Returns:
            data (torch.Tensor): Processed object given the input and custom
                postprocessing Pipeline.
        """
        if self.numerical:
            if isinstance(batch[0], list):
                # Pad already-numerical examples with a sentinel value that
                # cannot collide with any (possibly shifted) real index.
                pad_value = max([max(example) for example in batch]) + 1000
                batch = deepcopy(batch)
                for example in batch:
                    if self.init_token is not None:
                        # If an init token is used, shift every index up by one.
                        for idx, _ in enumerate(example):
                            example[idx] += 1

                max_len = max([len(example) for example in batch])
                for example in batch:
                    if len(example) < max_len:
                        example += [pad_value] * (max_len - len(example))
            tensor = torch.LongTensor(batch)
            tensor = tensor.to(device)
        else:
            padded = self.pad(batch)
            tensor = self.numericalize(padded, device=device, train=train, **kwargs)
        return tensor

    def pad(self, minibatch):
        """Pad a batch of examples using this field.

        Pads to self.fix_length if provided, otherwise pads to the length of
        the longest example in the batch. Prepends self.init_token and appends
        self.eos_token if those attributes are not None. Returns a tuple of the
        padded list and a list containing lengths of each example if
        `self.include_lengths` is `True` and `self.sequential` is `True`, else just
        returns the padded list. If `self.sequential` is `False`, no padding is applied.
        """
        minibatch = list(minibatch)
        if not self.sequential:
            return minibatch
        if self.fix_length is None:
            max_len = max(len(x) for x in minibatch)
        else:
            max_len = self.fix_length + (
                self.init_token, self.eos_token).count(None) - 2
        padded, lengths = [], []
        for x in minibatch:
            if self.pad_first:
                padded.append(
                    [self.pad_token] * max(0, max_len - len(x)) +
                    ([] if self.init_token is None else [self.init_token]) +
                    list(x[:max_len]) +
                    ([] if self.eos_token is None else [self.eos_token]))
            else:
                padded.append(
                    ([] if self.init_token is None else [self.init_token]) +
                    list(x[:max_len]) +
                    ([] if self.eos_token is None else [self.eos_token]) +
                    [self.pad_token] * max(0, max_len - len(x)))
            lengths.append(len(padded[-1]) - max(0, max_len - len(x)))
        if self.include_lengths:
            return (padded, lengths)
        return padded
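
    # Example of ``pad`` on a two-example minibatch (illustrative values;
    # note that padding is appended after the eos token):
    #
    #     f = Field(init_token='<bos>', eos_token='<eos>')
    #     f.pad([['a', 'b'], ['c']])
    #     # -> [['<bos>', 'a', 'b', '<eos>'],
    #     #     ['<bos>', 'c', '<eos>', '<pad>']]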

    def build_vocab(self, *args, **kwargs):
        """Construct the Vocab object for this field from one or more datasets.

        Arguments:
            Positional arguments: Dataset objects or other iterable data
                sources from which to construct the Vocab object that
                represents the set of possible values for this field. If
                a Dataset object is provided, all columns corresponding
                to this field are used; individual columns can also be
                provided directly.
            Remaining keyword arguments: Passed to the constructor of Vocab.
        """
        counter = Counter()
        sources = []
        for arg in args:
            if hasattr(arg, 'fields'):
                sources += [getattr(arg, name) for name, field in
                            arg.fields.items() if field is self]
            else:
                sources.append(arg)
        for data in sources:
            for x in data:
                if not self.sequential:
                    x = [x]
                counter.update(x)
        specials = [self.unk_token, self.pad_token, self.init_token, self.eos_token]
        specials = list(OrderedDict.fromkeys(tok for tok in specials if tok is not None))
        self.vocab = self.vocab_cls(counter, specials=specials, **kwargs)
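
    # Usage sketch for ``build_vocab`` (illustrative; ``train_dataset`` is an
    # assumed Dataset whose columns use this field, and the keyword arguments
    # assume the Vocab constructor accepts the usual max_size/min_freq options):
    #
    #     TEXT.build_vocab(train_dataset, max_size=25000, min_freq=2)
    #     len(TEXT.vocab), TEXT.vocab.stoi['<pad>']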

    def append_vocab(self, other_field):
        """Extend this field's vocab with every token from another field's vocab."""
        for w in other_field.vocab.stoi:
            if w not in self.vocab.stoi:
                self.vocab.stoi[w] = len(self.vocab.itos)
                self.vocab.itos.append(w)

    def vocab_from_counter(self, counter, **kwargs):
        """Build this field's Vocab directly from a pre-computed Counter."""
        specials = list(OrderedDict.fromkeys(
            tok for tok in [self.unk_token, self.pad_token, self.init_token,
                            self.eos_token]
            if tok is not None))
        self.vocab = self.vocab_cls(counter, specials=specials, **kwargs)

    def numericalize(self, arr, device=None, train=True, limited=None, l2f=None, oov2l=None):
        """Turn a batch of examples that use this field into tensors.

        If the field has include_lengths=True, a list of lengths will be
        included in the return value.

        Arguments:
            arr (List[List[str]], or tuple of (List[List[str]], List[int])):
                List of tokenized and padded examples, or tuple of List of
                tokenized and padded examples and List of lengths of each
                example if self.include_lengths is True.
            device: Device passed to ``Tensor.to`` for the resulting tensors.
                Default: None.
            train (bool): Whether the batch is for a training set; passed
                through to the postprocessing Pipeline. Default: True.
            limited (dict, optional): Token-to-index map for a limited output
                vocabulary. Defaults to ``self.vocab.stoi``.
            l2f (dict, optional): Filled in place with a mapping from the
                limited indices of out-of-vocabulary tokens to full-vocabulary
                indices.
            oov2l (dict, optional): Filled in place with a mapping from
                out-of-vocabulary tokens to newly assigned limited indices.
        """
        if limited is None:
            limited = self.vocab.stoi
        if l2f is None:
            l2f = {}
        if oov2l is None:
            oov2l = {}
        lim_arr = None
        if self.include_lengths and not isinstance(arr, tuple):
            raise ValueError("Field has include_lengths set to True, but "
                             "input data is not a tuple of "
                             "(data batch, batch lengths).")
        if isinstance(arr, tuple):
            arr, lengths = arr
            # lengths = torch.LongTensor(lengths)

        if self.use_vocab:
            if self.sequential:
                def limited_idx(x):
                    # Map a token to an index in the limited vocabulary,
                    # extending both the limited and the full vocabulary when
                    # an out-of-vocabulary token is encountered.
                    if x in limited:
                        lim_idx = limited[x]
                    elif x in oov2l:
                        lim_idx = oov2l[x]
                    else:
                        lim_idx = len(limited) + len(oov2l)
                        oov2l[x] = lim_idx
                        if x not in self.vocab.stoi:
                            self.vocab.stoi[x] = len(self.vocab.itos)
                            self.vocab.itos.append(x)
                        l2f[lim_idx] = self.vocab.stoi[x]
                    return lim_idx

                lim_arr = [[limited_idx(x) for x in ex] for ex in arr]
                num = [[self.vocab.stoi[x] for x in ex] for ex in arr]
            else:
                num = [self.vocab.stoi[x] for x in arr]

            if self.postprocessing is not None:
                num = self.postprocessing(num, self.vocab, train)
        else:
            if self.tensor_type not in self.tensor_types:
                raise ValueError(
                    "Specified Field tensor_type {} can not be used with "
                    "use_vocab=False because we do not know how to numericalize it. "
                    "Please raise an issue at "
                    "https://github.com/pytorch/text/issues".format(self.tensor_type))
            numericalization_func = self.tensor_types[self.tensor_type]
            # It doesn't make sense to explicitly coerce to a numeric type if
            # the data is sequential, since it's unclear how to coerce padding
            # tokens to a numeric type.
            num = arr
            if not self.sequential:
                num = [numericalization_func(x) if isinstance(x, six.string_types)
                       else x for x in num]
            if self.postprocessing is not None:
                num = self.postprocessing(num, None, train)

        num = self.tensor_type(num)
        if lim_arr is not None:
            lim_arr = self.tensor_type(lim_arr)
        if self.sequential and not self.batch_first:
            num.t_()
            if lim_arr is not None:
                lim_arr.t_()
        if self.sequential:
            num = num.contiguous()
            if lim_arr is not None:
                lim_arr = lim_arr.contiguous()
        num = num.to(device)
        if lim_arr is not None:
            lim_arr = lim_arr.to(device)
        if self.include_lengths:
            return num, lengths, lim_arr, arr
        # When include_lengths is False, the padded token lists are returned
        # unchanged, matching the original behaviour of this method.
        return arr
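

# Usage sketch for the extended-vocabulary path of ``numericalize`` (inferred
# from the code above; the exact calling convention used elsewhere in this
# project may differ, and ``padded``/``lengths``/``some_limited_stoi`` are
# assumed names):
#
#     l2f, oov2l = {}, {}   # shared, mutable lookup tables filled in place
#     out = TEXT.numericalize((padded, lengths), device='cpu',
#                             limited=some_limited_stoi, l2f=l2f, oov2l=oov2l)
#     # With include_lengths=True this returns (num, lengths, lim_arr, arr):
#     # full-vocabulary indices, example lengths, limited/extended indices,
#     # and the original padded tokens.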


class ReversibleField(Field):

    def __init__(self, **kwargs):
        if kwargs.get('tokenize') is list:
            self.use_revtok = False
        else:
            self.use_revtok = True
        if kwargs.get('tokenize') is None:
            kwargs['tokenize'] = 'revtok'
        if 'unk_token' not in kwargs:
            kwargs['unk_token'] = ' UNK '
        if self.use_revtok:
            try:
                import revtok
            except ImportError:
                print("Please install revtok.")
                raise
            self.detokenize = revtok.detokenize
        else:
            self.detokenize = None
        super(ReversibleField, self).__init__(**kwargs)

    def reverse(self, batch, detokenize=None, limited=False):
        if not self.batch_first:
            batch = batch.t()
        with torch.cuda.device_of(batch):
            batch = batch.tolist()
        batch = [[self.vocab.itos[ind] for ind in ex] for ex in batch]  # denumericalize

        def trim(s, t):
            sentence = []
            for w in s:
                if w == t:
                    break
                sentence.append(w)
            return sentence

        batch = [trim(ex, self.eos_token) for ex in batch]  # trim past first eos

        def filter_special(tok):
            return tok not in (self.init_token, self.pad_token)

        batch = [list(filter(filter_special, ex)) for ex in batch]
        if detokenize is None:
            detokenize = self.detokenize
        if detokenize is not None:
            return [detokenize(ex) for ex in batch]
        else:
            return [''.join(ex) for ex in batch]
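

# Usage sketch for ``reverse`` (illustrative; ``predicted_token_ids`` is an
# assumed LongTensor of shape (seq_len, batch) holding vocabulary indices):
# turn model output back into strings, trimming everything past the first
# eos token and dropping init/pad tokens.
#
#     REV = ReversibleField(init_token='<bos>', eos_token='<eos>', lower=True)
#     # ... build vocab, numericalize, run a model ...
#     sentences = REV.reverse(predicted_token_ids)   # list of detokenized strings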


class SubwordField(ReversibleField):

    vocab_cls = SubwordVocab

    def __init__(self, **kwargs):
        kwargs['tokenize'] = 'subword'
        if 'unk_token' not in kwargs:
            kwargs['unk_token'] = '�'
        super(SubwordField, self).__init__(**kwargs)

    def segment(self, *args):
        """Segment one or more datasets with this subword field.

        Arguments:
            Positional arguments: Dataset objects or other indexable
                mutable sequences to segment. If a Dataset object is provided,
                all columns corresponding to this field are used; individual
                columns can also be provided directly.
        """
        sources = []
        for arg in args:
            if isinstance(arg, Dataset):
                sources += [getattr(arg, name) for name, field in
                            arg.fields.items() if field is self]
            else:
                sources.append(arg)
        for data in sources:
            for x in tqdm(data, 'segmenting'):
                x[:] = self.vocab.segment(x)
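

# Usage sketch for ``SubwordField`` (illustrative; ``train`` is an assumed
# Dataset whose text column uses this field, and the max_size keyword assumes
# the SubwordVocab constructor accepts it):
#
#     SUBWORD = SubwordField(init_token='<bos>', eos_token='<eos>')
#     SUBWORD.build_vocab(train, max_size=32000)
#     SUBWORD.segment(train)   # rewrites each example in place as subword units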