# Build the lookup table: column 1 of each row is the key and the value is a
# 4-tuple assembled from columns 2-4 of the same row.
symbols = {}
for record in df.values:
    text = record[2].strip()
    # When the stripped text is longer than one character, its last character
    # is split off into a field of its own; otherwise that field is empty.
    if len(text) > 1:
        head, tail = text[:-1], text[-1]
    else:
        head, tail = text, ''
    symbols[record[1]] = (head, tail, record[3].strip(), str(record[4]))
利用pandas查看生成结果:
# Preview the first ten entries of ``symbols`` as a DataFrame that has one
# row per dictionary key.
preview = pd.DataFrame.from_dict(symbols, orient='index')
preview.head(10)
测试数据匹配
好,接下来,我们尝试读取word文档的第一个表格,并匹配获取每个字音需要添加的音韵调:
from docx import Document
# Open the document and, for every cell of the first table's header row
# except the first one, print the cell text next to the ``symbols`` entry
# looked up by the character at index 5 of that text.
doc = Document(r'01老男单字字音对照表(兴义).docx')
first_table = doc.tables[0]
for cell in first_table.rows[0].cells[1:]:
    text = cell.text
    print(text, symbols[text[5]])
# Append one new row to the first table and fill each of its cells (skipping
# the first column) with the ``symbols`` entry that matches the header cell
# in the same column, then save the result to a temporary file.
doc = Document(r'01老男单字字音对照表(兴义).docx')
table = doc.tables[0]
headers = table.rows[0].cells[1:]
targets = table.add_row().cells[1:]
for header, target in zip(headers, targets):
    # The lookup key is the character at index 4 of the header text.
    first, second, third, fourth = symbols[header.text[4]]
    # Unpacking requires the new cell to hold exactly one paragraph.
    [para] = target.paragraphs
    para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    # NOTE(review): this assigns the font on the paragraph's style object,
    # not on this paragraph alone -- presumably other paragraphs sharing the
    # style are affected too; confirm that is intended.
    para.style.font.name = 'IPAPANNEW'
    # The second and fourth fields are written as superscript runs.
    para.add_run(first)
    para.add_run(second).font.superscript = True
    para.add_run(third)
    para.add_run(fourth).font.superscript = True
doc.save('tmp.docx')
结果:
可以看到已经顺利添加进去,并没有什么问题。
最终测试
现在改下代码,添加所有单元格:
# Same fill as the single-table trial, applied to every table in the
# document: each table gets one appended row holding the ``symbols`` entries
# for its header cells.
doc = Document(r'01老男单字字音对照表(兴义).docx')
for table in doc.tables:
    headers = table.rows[0].cells[1:]
    targets = table.add_row().cells[1:]
    for header, target in zip(headers, targets):
        # The lookup key is the character at index 4 of the header text.
        first, second, third, fourth = symbols[header.text[4]]
        # Unpacking requires the new cell to hold exactly one paragraph.
        [para] = target.paragraphs
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        # NOTE(review): this assigns the font on the paragraph's style
        # object, not on this paragraph alone -- confirm that is intended.
        para.style.font.name = 'IPAPANNEW'
        # The second and fourth fields are written as superscript runs.
        para.add_run(first)
        para.add_run(second).font.superscript = True
        para.add_run(third)
        para.add_run(fourth).font.superscript = True
doc.save('result.docx')
结果:
可以看到,对应的字音都已顺利添加。但还有一点不太满意:有些行整行都是空白单元格,最好将其删除。
增加删除空行的代码:
# For every table: first drop the fully blank rows, then append one row
# holding the ``symbols`` entries for the header cells, and finally save.
doc = Document(r'01老男单字字音对照表(兴义).docx')
for table in doc.tables:
    # Starting from the fourth row, check for and remove the table's blank
    # rows (rows in which every cell's text is empty).
    for row in table.rows[3:]:
        if all(cell.text == '' for cell in row.cells):
            table._tbl.remove(row._tr)
    # Take all cells of the first row, from the second cell onward.
    headers = table.rows[0].cells[1:]
    # Take all cells of the newly added row, from the second cell onward.
    targets = table.add_row().cells[1:]
    for header, target in zip(headers, targets):
        # The lookup key is the character at index 4 of the header text.
        first, second, third, fourth = symbols[header.text[4]]
        # Unpacking requires the new cell to hold exactly one paragraph.
        [para] = target.paragraphs
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        # NOTE(review): this assigns the font on the paragraph's style
        # object, not on this paragraph alone -- confirm that is intended.
        para.style.font.name = 'IPAPANNEW'
        # The second and fourth fields are written as superscript runs.
        para.add_run(first)
        para.add_run(second).font.superscript = True
        para.add_run(third)
        para.add_run(fourth).font.superscript = True
doc.save('result.docx')
再次执行后,用 Office 打开的结果:
可以看到,空行都已被顺利地删除。
完整处理代码
整个过程已经完整测试通过,最终完整处理代码为:
from docx.enum.text import WD_ALIGN_PARAGRAPH from docx import Document import pandas as pd import numpy as np
# Load four columns of the spreadsheet and build the lookup table: the first
# loaded column is the key and the value is a 4-tuple assembled from the
# remaining three.
df = pd.read_excel('老男单字.xls', usecols=[1, 3, 4, 5])
symbols = {}
for record in df.values:
    text = record[1].strip()
    # When the stripped text is longer than one character, its last character
    # is split off into a field of its own; otherwise that field is empty.
    if len(text) > 1:
        head, tail = text[:-1], text[-1]
    else:
        head, tail = text, ''
    symbols[record[0]] = (head, tail, record[2].strip(), str(record[3]))
# Final processing pass: for each table, remove the fully blank rows, append
# a row filled from ``symbols`` according to the header cells, then save.
doc = Document(r'01老男单字字音对照表(兴义).docx')
for table in doc.tables:
    # Starting from the fourth row, check for and remove the table's blank
    # rows (rows in which every cell's text is empty).
    for row in table.rows[3:]:
        if all(cell.text == '' for cell in row.cells):
            table._tbl.remove(row._tr)
    # Take all cells of the first row, from the second cell onward.
    headers = table.rows[0].cells[1:]
    # Take all cells of the newly added row, from the second cell onward.
    targets = table.add_row().cells[1:]
    for header, target in zip(headers, targets):
        # The lookup key is the character at index 4 of the header text.
        first, second, third, fourth = symbols[header.text[4]]
        # Unpacking requires the new cell to hold exactly one paragraph.
        [para] = target.paragraphs
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        # NOTE(review): this assigns the font on the paragraph's style
        # object, not on this paragraph alone -- confirm that is intended.
        para.style.font.name = 'IPAPANNEW'
        # The second and fourth fields are written as superscript runs.
        para.add_run(first)
        para.add_run(second).font.superscript = True
        para.add_run(third)
        para.add_run(fourth).font.superscript = True
doc.save('result.docx')
# Dry run over the vocabulary document: parse an integer key from the first
# four characters of every header cell (first column skipped) to check that
# the conversion raises no error. Nothing is written.
doc = Document(r'02词汇对照表(兴义).docx')
i = 0  # not used by the loop below
for table in doc.tables:
    # Take all cells of the first row, from the second cell onward.
    for header_cell in table.rows[0].cells[1:]:
        prefix = header_cell.text[:4]
        # Any 'ʰ' in the prefix is replaced by the digit '9' before int().
        k = int(prefix.replace('ʰ', '9'))
未发现任何报错,说明对应编号获取成功,于是就可以通过将键k传入symbols获取需要写入的数据。
最终word生成代码
from docx import Document
# Fill the vocabulary document: each table gets one appended row, and each
# cell of that row (first column skipped) receives the entry whose integer
# key is parsed from the header cell above it.
#
# NOTE(review): ``symbols`` is read here as a mapping from that integer key
# to a mapping with '词{i}字' / '词{i}音' keys -- it must be built elsewhere;
# confirm against the loading code.
# NOTE(review): this snippet never calls ``doc.save(...)``, so the changes
# stay in memory only -- confirm whether the save step lives elsewhere.
doc = Document(r'02词汇对照表(兴义).docx')
for t in doc.tables:
    # Take all cells of the first row, from the second cell onward.
    header_cells = t.rows[0].cells[1:]
    # Take all cells of the newly added row, from the second cell onward.
    row_cells = t.add_row().cells[1:]
    for header_cell, row_cell in zip(header_cells, row_cells):
        # The key is the first four characters of the header text, with any
        # 'ʰ' replaced by the digit '9', converted to int.
        k = int(header_cell.text[:4].replace('ʰ', '9'))
        symbol_dict = symbols[k]
        # Fixed: the original read ``ifnot``, which is a syntax error.
        # A second paragraph is added only when a second word is present.
        if not pd.isna(symbol_dict['词2字']):
            row_cell.add_paragraph()
        # Paragraph i (1-based) receives word i followed by its
        # pronunciation.
        for i, p in enumerate(row_cell.paragraphs, 1):
            # NOTE(review): this assigns the font on the paragraph's style
            # object, not on this paragraph alone -- confirm it is intended.
            p.style.font.name = 'IPAPANNEW'
            p.add_run(symbol_dict[f'词{i}字']).font.name = '宋体'
            # Each element of the pronunciation becomes its own run; the
            # elements at odd indices are rendered as superscript.
            for n, e in enumerate(symbol_dict[f'词{i}音']):
                run = p.add_run(e)
                if n % 2 == 1:
                    run.font.superscript = True
# Fill the vocabulary document: each table gets one appended row, and each
# cell of that row (first column skipped) receives the entry whose integer
# key is parsed from the header cell above it.
#
# NOTE(review): ``symbols`` is read here as a mapping from that integer key
# to a mapping with '词{i}字' / '词{i}音' keys -- it must be built elsewhere;
# confirm against the loading code.
# NOTE(review): this snippet never calls ``doc.save(...)``, so the changes
# stay in memory only -- confirm whether the save step lives elsewhere.
doc = Document(r'02词汇对照表(兴义).docx')
for t in doc.tables:
    # Take all cells of the first row, from the second cell onward.
    header_cells = t.rows[0].cells[1:]
    # Take all cells of the newly added row, from the second cell onward.
    row_cells = t.add_row().cells[1:]
    for header_cell, row_cell in zip(header_cells, row_cells):
        # The key is the first four characters of the header text, with any
        # 'ʰ' replaced by the digit '9', converted to int.
        k = int(header_cell.text[:4].replace('ʰ', '9'))
        symbol_dict = symbols[k]
        # Fixed: the original read ``ifnot``, which is a syntax error.
        # A second paragraph is added only when a second word is present.
        if not pd.isna(symbol_dict['词2字']):
            row_cell.add_paragraph()
        # Paragraph i (1-based) receives word i followed by its
        # pronunciation.
        for i, p in enumerate(row_cell.paragraphs, 1):
            # NOTE(review): this assigns the font on the paragraph's style
            # object, not on this paragraph alone -- confirm it is intended.
            p.style.font.name = 'IPAPANNEW'
            p.add_run(symbol_dict[f'词{i}字']).font.name = '宋体'
            # Each element of the pronunciation becomes its own run; the
            # elements at odd indices are rendered as superscript.
            for n, e in enumerate(symbol_dict[f'词{i}音']):
                run = p.add_run(e)
                if n % 2 == 1:
                    run.font.superscript = True