Kaggle从零到实践：Bert中文多项选择

520jefferson 2021-07-14

展开全文

机器阅读理解最大的挑战就是回答需要外部先验知识的问题，本文将使用Bert模型来完成C3（中文多项选择题）任务。

C3数据集一共有13369篇文章和19577个问题，其中60%用作训练集，20%用作开发集，20%用作测试集。

步骤1：查看数据样例

C3数据集的案例如下，模型需要对对话和问题进行理解，最后从待选选项中识别出正确的答案。

对话：
男：你今天晚上有时间吗?我们一起去看电影吧?
女：你喜欢恐怖片和爱情片，但是我喜欢喜剧片，科幻片一般。

问题：女的最喜欢哪种电影?
可选择项：['恐怖片', '爱情片', '喜剧片', '科幻片']
正确答案：喜剧片

步骤2：定义数据读取格式

接下来完成具体的数据读取格式转换，首先读取Bert Tokenizer。

import torch
from transformers import BertTokenizer
# Load the tokenizer that ships with the 'bert-base-chinese' checkpoint.
# NOTE(review): `num_choices` does not look like a tokenizer option and is
# presumably ignored here -- confirm and consider dropping it.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', num_choices=4)

然后定义具体的batch数据处理，需要将问题和待选项进行处理。

def collate_fn(data):
    """Tokenize a batch of samples and pack the encodings into tensors.

    For every sample ``x`` in ``data``, ``x[1]`` is given to the module-level
    ``tokenizer`` as the primary text and ``x[0]`` as ``text_pair``; each
    encoding is padded/truncated to 128 tokens. ``x[-1]`` is taken as the
    label.

    Per the original author's note, the article, question and choices are
    joined before tokenization and the id tensors are expected to have size
    (batch, n_choices, max_len).

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, label)`` of
        ``torch.Tensor`` objects.
    """
    encodings = [
        tokenizer(sample[1], text_pair=sample[0], padding='max_length',
                  truncation=True, max_length=128, return_tensors='pt')
        for sample in data
    ]

    def _gather(field):
        # Collect one field across the batch as nested lists, then build a
        # single tensor from them.
        return torch.tensor([enc[field].tolist() for enc in encodings])

    input_ids = _gather('input_ids')
    attention_mask = _gather('attention_mask')
    token_type_ids = _gather('token_type_ids')
    label = torch.tensor([sample[-1] for sample in data])
    return input_ids, attention_mask, token_type_ids, label

最后定义Dataset，需要将一道多选题转化为“问题与每个选项逐一配对”的形式。

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset

class TextDataset(Dataset):
    """Dataset that expands one multiple-choice sample into per-choice pairs.

    Each entry of ``data`` is indexed as ``entry[0]`` (an iterable of strings
    that is joined with '。' to form the passage) and ``entry[1][0]`` (a dict
    with the keys ``'question'`` and ``'choice'``).

    Args:
        data: Sequence of samples in the layout described above.
        labels: Sequence of labels, aligned with ``data`` by index.
        num_choices: Minimum number of choices per sample; shorter choice
            lists are padded with '不知道'. Defaults to 4.
    """

    def __init__(self, data, labels, num_choices=4):
        self.data = data
        self.labels = labels
        self.num_choices = num_choices

    def __getitem__(self, idx):
        """Return ``(content, pair, label)`` for the sample at ``idx``.

        ``content`` repeats the joined passage once per choice and ``pair``
        holds ``question + ' ' + choice`` for every (padded) choice.
        """
        label = self.labels[idx]
        sample = self.data[idx]
        qa = sample[1][0]
        question = qa['question']
        passage = '。'.join(sample[0])

        # Work on a copy: padding in place would permanently modify the
        # caller's data the first time an item is fetched.
        choices = list(qa['choice'])
        # A non-positive multiplier yields an empty list, so samples that
        # already have enough choices are left untouched.
        choices.extend(['不知道'] * (self.num_choices - len(choices)))

        content = [passage] * len(choices)
        pair = [question + ' ' + choice for choice in choices]

        return content, pair, label

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

# Wrap the training and validation splits. `train`, `train_label`, `val` and
# `val_label` are not defined in this excerpt and must already exist.
# NOTE(review): the name `train` is rebound further down by `def train()`;
# these lines must run before that definition.
train_dataset = TextDataset(train, train_label)
test_dataset = TextDataset(val, val_label)

步骤3：定义Bert模型

这里可以直接使用BertForMultipleChoice来完成Finetune过程。

import torch
from transformers import BertForMultipleChoice, AdamW, get_linear_schedule_with_warmup
# Load the 'bert-base-chinese' checkpoint into a multiple-choice model.
model = BertForMultipleChoice.from_pretrained('bert-base-chinese')

# Prefer CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

步骤4：模型训练与验证

当定义好数据集、模型后（还需要用 collate_fn 分别创建 train_loader 和 test_dataloader，并定义优化器 optim 与学习率调度器 scheduler，下文代码会直接使用这些变量），接下来是万年不变的模型正向传播和反向传播代码。

from tqdm import tqdm

def train():
    """Run one training pass over ``train_loader``.

    Uses the module-level ``model``, ``train_loader``, ``optim``,
    ``scheduler`` and ``device``; ``epoch`` is also read from module scope,
    for logging only. Prints the batch accuracy and loss every 20 batches,
    a progress line every 100 batches and the average loss at the end.
    """
    model.train()
    total_train_loss = 0
    total_iter = len(train_loader)
    for idx, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_loader):
        optim.zero_grad()

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        # The inputs are encoded as sentence pairs, so the segment ids have to
        # reach the model; otherwise they are silently left at their default.
        token_type_ids = token_type_ids.to(device)
        labels = labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids, labels=labels)

        loss = outputs.loss

        if idx % 20 == 0:
            with torch.no_grad():
                batch_accuracy = (outputs.logits.argmax(1) == labels).float().mean().item()
            print(batch_accuracy, loss.item())

        total_train_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step()
        scheduler.step()

        iter_num = idx + 1
        if iter_num % 100 == 0:
            print('epoth: %d, iter_num: %d, loss: %.4f, %.2f%%' % (epoch, iter_num, loss.item(), iter_num/total_iter*100))

    print('Epoch: %d, Average training loss: %.4f'%(epoch, total_train_loss/len(train_loader)))
    
def validation():
    """Evaluate the model on ``test_dataloader`` and print the metrics.

    Uses the module-level ``model``, ``test_dataloader`` and ``device``.
    Prints the accuracy over all evaluated samples and the loss averaged
    over batches.
    """
    model.eval()
    correct = 0
    seen = 0
    total_eval_loss = 0
    for input_ids, attention_mask, token_type_ids, labels in test_dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        # Pass the segment ids as well: the inputs are sentence pairs.
        token_type_ids = token_type_ids.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask,
                            token_type_ids=token_type_ids, labels=labels)

        total_eval_loss += outputs.loss.item()
        predictions = outputs.logits.argmax(1)
        # Count hits per sample rather than averaging per-batch means, which
        # would over-weight a smaller final batch.
        correct += (predictions == labels).sum().item()
        seen += labels.size(0)

    avg_val_accuracy = correct / seen
    print('Accuracy: %.4f' % (avg_val_accuracy))
    print('Average testing loss: %.4f'%(total_eval_loss/len(test_dataloader)))
    print('-------------------------------')

# Evaluate at the start of every epoch (the first pass scores the model
# before any fine-tuning), then train for one epoch.
for epoch in range(4):
    print('------------Epoch: %d ----------------' % epoch)
    validation()
    train()

# Score the weights produced by the last training epoch too; without this
# the final model would never be evaluated.
print('------------Final evaluation----------------')
validation()