ltp.data.dataset.line 源代码

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Yunlong Feng <ylfeng@ir.hit.edu.cn>
import os

from tqdm import tqdm

import re
from . import Dataset
from ltp.data.example import Example
from ltp.data.utils import iter_lines


class LineDataset(Dataset, alias="line"):
    """Line-oriented dataset, registered under the alias ``line``.

    Every non-empty record produced by ``iter_lines`` is treated as one
    sentence made of tokens. Each token is cut into columns with the
    ``split`` regular expression, and the selected columns are passed to
    ``Example.fromlist``. For instance, with ``split='_'``::

        上海_NR 浦东_NR 开发_NN 与_CC 法制_NN 建设_NN 同步_VV

    or, with ``split='/'``::

        上海/NR 浦东/NR 开发/NN 与/CC 法制/NN 建设/NN 同步/VV

    Args:
        path: Leading path component; joined with ``file`` via
            ``os.path.join``.
        file: Trailing path component of the data file.
        fields: Sequence of field entries. Entries that are ``None`` get no
            column assigned.
        split: Regular expression used to split each token into columns.
            Defaults to the NUL character.
        maxsplit: Forwarded to the compiled pattern's ``split``; ``0`` means
            "no limit" there.
        silent: When true, an example that fails to build is dropped without
            any output; otherwise the line number, error and line are printed.
        proxy_property: Optional mapping ``{proxy_name: source_name}``. The
            proxy field is made to read the same column as the source field.
        line_split: Forwarded to ``iter_lines``.
        line_strip: Forwarded to ``iter_lines``.
        **kwargs: Forwarded to ``Dataset.__init__``.
    """

    def __init__(self, path, file, fields, split='\0', maxsplit=0, silent=True, proxy_property=None,
                 line_split=None, line_strip=None, **kwargs):
        source_file = os.path.join(path, file)
        token_splitter = re.compile(split)

        # Column index to read for every entry of ``fields`` that is not None.
        column_of = [position for position, entry in enumerate(fields) if entry is not None]

        if proxy_property is not None:
            # Entries are looked up by their first element
            # (presumably a (name, field) pair -- TODO confirm).
            # NOTE(review): positions here index ``fields`` while ``column_of``
            # skips None entries, so the two can disagree (and ``entry[0]``
            # raises) when ``fields`` contains None -- confirm with callers.
            position_of = {entry[0]: position for position, entry in enumerate(fields)}
            for proxy_name, source_name in proxy_property.items():
                column_of[position_of[proxy_name]] = column_of[position_of[source_name]]

        built = []
        # The iterator is materialised so tqdm knows the total up front.
        for line_num, tokens in tqdm(list(iter_lines(source_file, line_split, line_strip))):
            if not len(tokens):
                continue

            # Split every token, then transpose so that each inner list holds
            # one column across the whole sentence.
            pieces = [token_splitter.split(token, maxsplit) for token in tokens]
            columns = [list(column) for column in zip(*pieces)]
            selected = [columns[index] for index in column_of]

            try:
                built.append(Example.fromlist(selected, fields))
            except Exception as error:
                # Best effort: a bad line is skipped, optionally reported.
                if not silent:
                    print(line_num, error, tokens)

        super(LineDataset, self).__init__(built, fields, **kwargs)
from ltp.utils import deprecated


@deprecated(info='CTB数据集已经被line代替')
def CTBDataset(path, fields, silent=True, proxy_property=None, split=None, strip=None, **kwargs):
    """Build a ``LineDataset`` through the deprecated ``CTB`` entry point.

    The token separator is fixed to the NUL character. ``split`` and
    ``strip`` here are the line-level options and are passed on as
    ``line_split`` and ``line_strip``.

    Args:
        path: Passed through as ``LineDataset``'s ``path``.
        fields: Passed through as ``LineDataset``'s ``fields``.
        silent: Passed through unchanged.
        proxy_property: Passed through unchanged.
        split: Forwarded as ``line_split``.
        strip: Forwarded as ``line_strip``.
        **kwargs: Any further keyword arguments for ``LineDataset``
            (``file`` has to arrive this way, since it is not a named
            parameter here).

    Returns:
        The constructed ``LineDataset``.
    """
    return LineDataset(
        path=path,
        fields=fields,
        split='\0',
        silent=silent,
        proxy_property=proxy_property,
        line_split=split,
        line_strip=strip,
        **kwargs
    )


# Keep the old 'CTB' name resolvable through the Dataset registry.
Dataset.weak_register('CTB', CTBDataset)