# -*- coding: utf-8 -*-
from __future__ import print_function
import codecs
# Demonstration: build a filtered vocabulary list from a vocab TSV file in two
# equivalent ways (an explicit loop and a single list comprehension) and check
# that both give the same result.
#
# codecs.open lets us declare the encoding of the file being opened; the
# content is decoded to unicode automatically while it is read.
# The `with` block guarantees the file handle is closed once reading is done.
with codecs.open('/home/dms/PycharmProjects/transformer/preprocessed/de.vocab.tsv', 'r', 'utf-8') as coder:
    print(type(coder))  # original note: <type 'instance'>
    reader = coder.read()
print(type(reader))  # original note: <type 'unicode'>

vocablist = reader.splitlines()
print(type(vocablist))

vv = []
for line in vocablist:
    # Each line is split into whitespace-separated columns,
    # e.g. u'<PAD>' u'1000000000': column 0 is kept, column 1 is compared
    # against the threshold.
    columns = line.split()
    if int(columns[1]) >= 20:
        vv.append(columns[0])

# The several lines above are equivalent to the single comprehension below.
with codecs.open('/home/dms/PycharmProjects/transformer/preprocessed/de.vocab.tsv', 'r', 'utf-8') as vocab_file:
    vocab = [line.split()[0] for line in vocab_file.read().splitlines() if int(line.split()[1]) >= 20]

# Confirm that the loop and the comprehension produced identical lists.
if vv == vocab:
    print("相等")
文件路径:
de.vocab.tsv文件内容:
|