As with traditional machine learning, data preparation in deep learning is time-consuming work, and there is no single standard way to do it. If you study TensorFlow, you will find a bewildering variety of techniques; they appear to give you maximum freedom, but in practice they quietly add unnecessary learning overhead. PyTorch imposes no restrictions either, but it provides the excellent Dataset and DataLoader abstractions, which you simply subclass and implement.

import torch
from torch.utils.data import Dataset, DataLoader
class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len

You only need to implement __getitem__ and __len__ yourself.
from ainlp.corpus import dummy_dataset
from torch.utils.data import DataLoader
import unittest

class TestDataSet(unittest.TestCase):
    def setUp(self):
        pass

    def test_dummy_dataset(self):
        # 100 rows, 5 columns of random numbers
        rand_loader = DataLoader(dataset=dummy_dataset.RandomDataset(size=5, length=100),
                                 batch_size=30,
                                 shuffle=True)

        # 100 rows x 5 columns with batch_size=30 gives 4 reads
        for i_batch, sample_batched in enumerate(rand_loader):
            print(i_batch, sample_batched.size())
            print(sample_batched)

One epoch of iteration reads 4 times, with batch_size=30 each time:

0 torch.Size([30, 5])
1 torch.Size([30, 5])
2 torch.Size([30, 5])
3 torch.Size([10, 5])

We use this pattern to wrap the CoNLL-2002 data.
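Before walking through the full wrapper class, it helps to see the raw material. The short sketch below is illustrative only, and assumes the corpus has been fetched once with nltk.download('conll2002'); it prints one tagged sentence and shows how padding with <DUMMY> tokens plus a sliding 5-gram turns each word into a fixed-size context window.

# Illustrative sketch: corpus format and the windowing idea.
import nltk

nltk.download('conll2002')  # one-time corpus download

sent = nltk.corpus.conll2002.iob_sents()[0]
print(sent[:3])  # [(word, POS, IOB-NER-tag), ...] triples

WINDOW_SIZE = 2
dummy = ['<DUMMY>'] * WINDOW_SIZE
words, POS, NER = list(zip(*sent))
windows = list(nltk.ngrams(dummy + list(words) + dummy, WINDOW_SIZE * 2 + 1))
# windows[i] is the 5-token context centered on words[i];
# pairing it with NER[i] yields one training example per token
print(windows[0], NER[0])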
import torch
import nltk
import random
from . import utils
from torch.utils.data import Dataset, DataLoader

class CoNLL2002Dataset(Dataset):
    def __init__(self, b_train=True):
        self.b_train = b_train
        corpus = nltk.corpus.conll2002.iob_sents()

        WINDOW_SIZE = 2
        windows = []
        dummy = ['<DUMMY>'] * WINDOW_SIZE

        all_words = []
        all_tags = []

        for i, cor in enumerate(corpus):
            # cor is one sentence's tagged sequence: [(word1, POS, NER), (), ()]
            words, POS, NER = list(zip(*cor))

            all_words.extend(words)
            all_tags.extend(NER)

            # turn the word sequence into 2-gram-context windows
            window = list(nltk.ngrams(dummy + list(words) + dummy, WINDOW_SIZE * 2 + 1))
            windows.extend([[list(window[i]), NER[i]] for i in range(len(words))])

        #random.shuffle(windows)
        self.train_data = windows[:int(len(windows) * 0.9)]
        self.test_data = windows[int(len(windows) * 0.9):]

        self.word2idx, self.idx2word = utils.word_to_index(all_words)
        self.tag2idx, self.idx2tag = utils.tag_to_index(all_tags)

    def get_batch(self, batch_size, is_train=True):
        data = self.train_data if is_train else self.test_data
        return utils.get_batch(batch_size, data)

    def batch_to_variable(self, data):
        # each item here is an aligned window; sentences no longer matter
        x, y = list(zip(*data))
        inputs = torch.cat([utils.prepare_sequence(sent, self.word2idx).view(1, -1) for sent in x])
        targets = torch.cat([utils.prepare_tag(tag, self.tag2idx) for tag in y])
        return inputs, targets

This is detail-oriented work and easy to get wrong. The modeling that follows is comparatively easy.
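The class above leans on a small utils module from ainlp that is not shown here. For completeness, here is a minimal sketch of what those helpers might look like; the function names come from the calls above, but the bodies are my assumptions, not the actual ainlp implementation.

# Hypothetical stand-ins for the ainlp utils helpers used above.
# Only the names are taken from the calls in CoNLL2002Dataset.
import torch

def word_to_index(words):
    # build a vocabulary with an <UNK> slot; the <DUMMY> padding
    # tokens in the windows also fall back to <UNK> here
    vocab = ['<UNK>'] + sorted(set(words))
    word2idx = {w: i for i, w in enumerate(vocab)}
    idx2word = {i: w for w, i in word2idx.items()}
    return word2idx, idx2word

def tag_to_index(tags):
    tagset = sorted(set(tags))
    tag2idx = {t: i for i, t in enumerate(tagset)}
    idx2tag = {i: t for t, i in tag2idx.items()}
    return tag2idx, idx2tag

def get_batch(batch_size, data):
    # yield successive batch_size-sized chunks of (window, tag) pairs
    for start in range(0, len(data), batch_size):
        yield data[start:start + batch_size]

def prepare_sequence(seq, word2idx):
    # map tokens to indices, falling back to <UNK>
    idxs = [word2idx.get(w, word2idx['<UNK>']) for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def prepare_tag(tag, tag2idx):
    return torch.tensor([tag2idx[tag]], dtype=torch.long)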
import torch.nn as nn

class WindowClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_size, window_size, hidden_size, output_size):
        super(WindowClassifier, self).__init__()

        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.h_layer1 = nn.Linear(embedding_size * (window_size * 2 + 1), hidden_size)
        self.h_layer2 = nn.Linear(hidden_size, hidden_size)
        self.o_layer = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropout = nn.Dropout(0.3)

    def forward(self, inputs, is_training=False):
        embeds = self.embed(inputs)  # B x W x D
        concated = embeds.view(-1, embeds.size(1) * embeds.size(2))  # B x (W*D)
        h0 = self.relu(self.h_layer1(concated))
        if is_training:
            h0 = self.dropout(h0)
        h1 = self.relu(self.h_layer2(h0))
        if is_training:
            h1 = self.dropout(h1)
        out = self.softmax(self.o_layer(h1))
        return out
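To see how the pieces fit together, here is a minimal training-loop sketch. The hyperparameter values are placeholders, and it relies on the hypothetical utils helpers sketched above; NLLLoss is the natural pairing for the model's LogSoftmax output.

# A minimal training-loop sketch (assumed hyperparameters, not the author's).
import torch.optim as optim

dataset = CoNLL2002Dataset()
model = WindowClassifier(vocab_size=len(dataset.word2idx),
                         embedding_size=50,
                         window_size=2,
                         hidden_size=300,
                         output_size=len(dataset.tag2idx))
loss_fn = nn.NLLLoss()  # pairs with the model's LogSoftmax output
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(3):
    for batch in dataset.get_batch(batch_size=128, is_train=True):
        inputs, targets = dataset.batch_to_variable(batch)
        model.zero_grad()
        preds = model(inputs, is_training=True)
        loss = loss_fn(preds, targets)
        loss.backward()
        optimizer.step()
    print(epoch, loss.item())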
About the author: Wei Jiabin, Internet product/technology director, MBA from Peking University's Guanghua School of Management, Chartered Financial Analyst (CFA), and veteran product manager/programmer. Partial to Python, he follows Internet trends, artificial intelligence, and AI-driven quantitative finance closely, and is committed to using the most advanced cognitive technologies to understand this complex world.