#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Yunlong Feng <ylfeng@ir.hit.edu.cn>
from collections import Counter, OrderedDict
from itertools import chain
from typing import Union

import torch
from torchtext.vocab import Vocab

from ltp.const import PAD
from . import Field
from ltp.data.dataset import Dataset


def dtype_to_attr(dtype):
    """Convert a torch.dtype to its string id (e.g. torch.int32 -> "int32"); used for serialization."""
    _, dtype = str(dtype).split('.')
    return dtype
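
# For example, str(torch.int32) is "torch.int32", so dtype_to_attr(torch.int32)
# returns "int32", and getattr(torch, dtype_to_attr(dtype)) recovers the original
# dtype; __getstate__/__setstate__ below rely on this round trip.
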
class SequenceField(Field, alias='sequence'):
    """
    Sequence Field, usually used as a target.

    :param name: Field name
    :param bos: begin-of-sentence token, defaults to None
    :param eos: end-of-sentence token, defaults to None
    :param unk: unknown tag, defaults to None
    :param pad: padding token or id, defaults to [PAD] (with a vocab) or -1 (without)
    :param dtype: torch.dtype; a string name such as "long" is also accepted
    :param pad_bias: shift the value range so that every id is decremented by 1
        and the default pad id becomes -1; enabled by default
    :param preprocessing: preprocessing pipeline
    :param postprocessing: postprocessing pipeline
    :param max_length: pad/truncate to this maximum length; None (the default)
        applies no special handling
    :param include_lengths: whether to also return the lengths, defaults to False
    :param labels: optional fixed label set; if given, the vocabulary is built
        immediately in __init__
    :param use_vocab: whether to use a vocabulary, defaults to True
    :param is_target: whether this field is a target, defaults to True
    """
    vocab_cls = Vocab
    dtypes = {
        torch.float32: float,
        torch.float: float,
        torch.float64: float,
        torch.double: float,
        torch.float16: float,
        torch.half: float,
        torch.uint8: int,
        torch.int8: int,
        torch.int16: int,
        torch.short: int,
        torch.int32: int,
        torch.int: int,
        torch.int64: int,
        torch.long: int,
    }
    ignore = ['dtype']

    def __init__(self, name, bos: Union[str, int] = None, eos: Union[str, int] = None,
                 unk: Union[str, int] = None, pad: Union[str, int] = None,
                 dtype=torch.long, pad_bias=True, preprocessing=None, postprocessing=None,
                 max_length: int = None, include_lengths=False, labels=None,
                 use_vocab=True, is_target: bool = True, **kwargs):
        super(SequenceField, self).__init__(name, preprocessing, postprocessing, is_target)
        # With a vocab, special tokens must be strings; without one, they must be raw ids.
        self.unk = unk if (isinstance(unk, str) and use_vocab) or (isinstance(unk, int) and not use_vocab) else None
        self.bos = bos if (isinstance(bos, str) and use_vocab) or (isinstance(bos, int) and not use_vocab) else None
        self.eos = eos if (isinstance(eos, str) and use_vocab) or (isinstance(eos, int) and not use_vocab) else None
        if use_vocab:
            self.pad = pad if isinstance(pad, str) else PAD
        else:
            self.pad = pad if isinstance(pad, int) else -1
        if isinstance(dtype, str):
            self.dtype = getattr(torch, dtype)
        else:
            self.dtype = dtype
        self.use_vocab = use_vocab
        self.max_length = max_length
        self.include_lengths = include_lengths
        self.pad_bias = pad_bias
        if labels:
            # A fixed label set was supplied: build the vocabulary up front.
            counter = Counter()
            counter.update(labels)
            specials = list(
                OrderedDict.fromkeys(
                    tok for tok in [self.unk, self.pad, self.bos, self.eos] + kwargs.pop('specials', [])
                    if tok is not None
                )
            )
            self.vocab = self.vocab_cls(counter, specials=specials)
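
    # Minimal construction sketch (hypothetical tag set; assumes the legacy
    # torchtext Vocab imported above, which prepends `specials` to the vocab):
    #   field = SequenceField('pos', labels=['n', 'v'])
    #   field.vocab.stoi[field.pad]  # the pad token is registered as a special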

    def build_vocab(self, *args, **kwargs):
        if hasattr(self, 'vocab'):
            # The vocabulary was already built (e.g. from `labels` in __init__).
            return
        counter = Counter()
        sources = []
        for arg in args:
            if isinstance(arg, Dataset):
                # Collect every column of the dataset that is bound to this field.
                sources += [getattr(arg, name) for name, field in
                            arg.fields.items() if field is self]
            else:
                sources.append(arg)
        for data in sources:
            for x in data:
                try:
                    counter.update(x)
                except TypeError:
                    # x is nested one level deeper; flatten it first.
                    counter.update(chain.from_iterable(x))
        specials = list(
            OrderedDict.fromkeys(
                tok for tok in [self.unk, self.pad, self.bos, self.eos] + kwargs.pop('specials', [])
                if tok is not None
            )
        )
        self.vocab = self.vocab_cls(counter, specials=specials, **kwargs)
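
    # Typical use when no fixed label set is given (hypothetical data; any
    # iterable of token sequences works, not only a Dataset):
    #   field = SequenceField('tag')
    #   field.build_vocab([['B', 'I', 'O'], ['O', 'O']])
    #   len(field.vocab)  # specials such as [PAD] plus the observed tags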

    def __setstate__(self, state):
        # Restore the torch.dtype from its serialized string id.
        state['dtype'] = getattr(torch, state['dtype'])
        return super(SequenceField, self).__setstate__(state)

    def __getstate__(self):
        # Store the dtype as a string id for serialization (see dtype_to_attr).
        attrs = super(SequenceField, self).__getstate__()
        attrs['dtype'] = dtype_to_attr(self.dtype)
        return attrs
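
    # These two hooks are what make the field picklable, e.g. (sketch):
    #   import pickle
    #   restored = pickle.loads(pickle.dumps(field))
    #   restored.dtype is field.dtype  # the dtype survives the round trip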

    def pad_batch(self, minibatch: list):
        minibatch = list(minibatch)
        max_len = max(len(x) for x in minibatch)
        if self.max_length is not None:
            # Reserve room within max_length for the bos/eos tokens that will be added.
            max_len = min(max_len, self.max_length + (self.bos, self.eos).count(None) - 2)
        padded, lengths = [], []
        for x in minibatch:
            padded.append(
                ([] if self.bos is None else [self.bos])
                + list(x[:max_len])
                + ([] if self.eos is None else [self.eos])
                + [self.pad] * max(0, max_len - len(x[:max_len]))
            )
            # Length of the real (non-pad) part, including bos/eos if present.
            lengths.append(len(padded[-1]) - max(0, max_len - len(x[:max_len])))
        if self.include_lengths:
            return padded, lengths
        return padded
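
    # For instance, with bos/eos unset and the default pad token (assuming
    # ltp.const.PAD is the string "[PAD]"):
    #   field.pad_batch([['a', 'b', 'c'], ['d']])
    #   # -> [['a', 'b', 'c'], ['d', '[PAD]', '[PAD]']]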

    def numericalize(self, arr, device=None):
        if self.include_lengths and not isinstance(arr, tuple):
            raise ValueError("Field has include_lengths set to True, but "
                             "input data is not a tuple of "
                             "(data batch, batch lengths).")
        lengths = None
        if isinstance(arr, tuple):
            arr, lengths = arr
        if self.use_vocab:
            # Map tokens to vocabulary ids.
            arr = [[self.vocab.stoi[x] for x in ex] for ex in arr]
            if self.postprocessing is not None:
                arr = self.postprocessing(arr, self.vocab)
        else:
            if self.dtype not in self.dtypes:
                raise ValueError(
                    f"Specified Field dtype {self.dtype} can not be used with "
                    "use_vocab=False because we do not know how to numericalize it."
                )
            # Without a vocab, cast raw values with the Python type matching the dtype.
            numericalization_func = self.dtypes[self.dtype]
            arr = [[numericalization_func(x) for x in ex] for ex in arr]
            if self.postprocessing is not None:
                arr = self.postprocessing(arr, None)
        var = torch.tensor(arr, dtype=self.dtype, device=device)
        var = var.contiguous()
        if self.include_lengths:
            lengths = torch.tensor(lengths, dtype=self.dtype, device=device)
            return var, lengths
        return var
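
    # E.g. when the data is already numeric (sketch; with use_vocab=False the
    # default pad id is -1 and values are cast with int for torch.long):
    #   field = SequenceField('id', use_vocab=False)
    #   field.numericalize([[3, 4], [5, -1]])  # tensor([[3, 4], [5, -1]])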

    def preprocess(self, x):
        if self.preprocessing is not None:
            x = self.preprocessing(x)
        return x

    def process(self, batch, device=None):
        padded = self.pad_batch(batch)
        tensor = self.numericalize(padded, device=device)
        if not self.pad_bias:
            return tensor
        # pad_bias: shift every id down by 1 so the default pad id becomes -1.
        if isinstance(tensor, torch.Tensor):
            return tensor - 1
        else:
            tensor, length = tensor
            return tensor - 1, length
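
# End-to-end sketch (hypothetical POS tags; assumes the legacy torchtext Vocab
# API used above and that ltp.const.PAD is "[PAD]"):
#   field = SequenceField('pos', labels=['n', 'v'])
#   field.process([['n', 'v'], ['n']])  # -> tensor([[0, 1], [0, -1]])
# With specials = ["[PAD]"] first, stoi maps [PAD] -> 0, and pad_bias then
# shifts every id down by 1, so padding positions come out as -1, which is
# convenient as an ignore_index for loss computation.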