分享

Python处理Excel工作表

 乙甲壬 2020-07-05

Excel文件检查

from xlrd import open_workbookinput_file='sales_2013.xlsx'workbook=open_workbook(input_file)print('Number of worksheets',workbook.nsheets)for worksheet in workbook.sheets(): print('sheet name:',worksheet.name,'\trows:',worksheet.nrows,'\tcolumns:',worksheet.ncols)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7

读写Excel文件

from xlrd import open_workbookfrom xlwt import Workbookinput_file='sales_2013.xlsx'output_file='excel_copy.xlsx'output_workbook=Workbook()output_worksheet=output_workbook.add_sheet('jan_2013_output')with open_workbook(input_file) as workbook:    worksheet=workbook.sheet_by_name('january_2013')    for row_index in range(worksheet.nrows):        for column_index in range(worksheet.ncols):            output_worksheet.write(row_index,column_index,worksheet.cell_value(row_index,column_index))output_workbook.save(output_file)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12

格式化Excel文件中的数据

from datetime import datefrom xlrd import open_workbook,xldate_as_tuplefrom xlwt import Workbookinput_file='sales_2013.xlsx'output_file='date_format.xls'output_workbook=Workbook()output_worksheet=output_workbook.add_sheet('jan_2013_output')with open_workbook(input_file) as workbook: worksheet=workbook.sheet_by_name('january_2013') for row_index in range(worksheet.nrows): row_list_output=[] for col_index in range(worksheet.ncols): if worksheet.cell_type(row_index,col_index) ==3: date_cell=xldate_as_tuple(worksheet.cell_value(row_index,col_index),workbook.datemode) date_cell=date(*date_cell[0:3]).strftime('%m%d%Y') row_list_output.append(date_cell) output_worksheet.write(row_index,col_index,date_cell) else: non_date_cell=worksheet.cell_value(row_index,col_index) row_list_output.append(non_date_cell) output_worksheet.write(row_index,col_index,non_date_cell) print(row_list_output)output_workbook.save(output_file)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23

Excel行中的值满足某个条件

import sysfrom datetime import datefrom xlrd import open_workbook,xldate_as_tuplefrom xlwt import Workbookinput_file='sales_2013.xlsx'output_file='g.xls'output_workbook=Workbook()output_worksheet=output_workbook.add_sheet('jan_2013_output')sale_amount_column_index=3with open_workbook(input_file) as workbook:    worksheet=workbook.sheet_by_name('january_2013')    data=[]    header=worksheet.row_values(0)    data.append(header)    for row_index in range(1,worksheet.nrows):        row_list=[]        sale_amount=worksheet.cell_value(row_index,sale_amount_column_index)        if sale_amount >1400.0:            for column_index in range(worksheet.ncols):                cell_value=worksheet.cell_value(row_index,column_index)                cell_type=worksheet.cell_type(row_index,column_index)                if cell_type ==3:                    date_cell=xldate_as_tuple(cell_value,workbook.datemode)                    date_cell=date(*date_cell[0:3]).strftime('%m%d%Y')                    row_list.append(date_cell)                else:                    row_list.append(cell_value)            if row_list:                data.append(row_list)    for list_index,output_list in enumerate(data):        for element_index,element in enumerate(output_list):            output_worksheet.write(list_index,element_index,element)output_workbook.save(output_file)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33

pandas版本

import pandas as pdimport sysinput_file='sales_2013.xlsx'output_file='gp.xls'data_frame=pd.read_excel(input_file,'january_2013',index_col=None)data_frame_value_meets_condition=data_frame[data_frame['Sale Amount'].astype(float) > 1400.0]writer=pd.ExcelWriter(output_file)data_frame_value_meets_condition.to_excel(writer,sheet_name='jan_13_output',index=False)writer.save()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9

Excel行中的值属于某个集合

import pandas as pdimport sysinput_file='sales_2013.xlsx'output_file='value_in_set.xls'data_frame=pd.read_excel(input_file,'january_2013',index_col=None)inportant_dates=['01/24/2013','01/31/2013']data_frame_value_in_set=data_frame[data_frame['Purchase Date'].isin(inportant_dates)]writer=pd.ExcelWriter(output_file)data_frame_value_in_set.to_excel(writer,sheet_name='jan_2013_output',index=False)writer.save()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10

Excel行中的值匹配某个表达式

import pandas as pdinput_file='sales_2013.xlsx'output_file='pandas_row_value_matches_re.xls'data_frame=pd.read_excel(input_file,'january_2013',index_col=None)data_frame_value_matches_pattern=data_frame[data_frame['Customer Name'].str.match('J')]#或者data_frame_value_matches_pattern=data_frame[data_frame['Customer Name'].str.startswith('J')]writer=pd.ExcelWriter(output_file)data_frame_value_matches_pattern.to_excel(writer,sheet_name='jan_13_output',index=False)writer.save()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9

Excel基于索引选取特定的列

import pandas as pdinput_file='sales_2013.xlsx'output_file='value_in_column.xls'data_frame=pd.read_excel(input_file,'january_2013',index_col=None)data_frame_by_index=data_frame.iloc[:,[1,4]]writer=pd.ExcelWriter(output_file)data_frame_by_index.to_excel(writer,sheet_name='jan_13',index=False)writer.save()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8

Excel基于列标题选取

import pandas as pdinput_file='sales_2013.xlsx'output_file='pandas_column_title_select.xls'data_frame=pd.read_excel(input_file,'january_2013',index_col=None)data_frame_column_by_name=data_frame.loc[:,['Customer ID','Purchase Date']]writer=pd.ExcelWriter(output_file)data_frame_column_by_name.to_excel(writer,sheet_name='jan_13_output',index=False)writer.save()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8

工作表计数以及每个工作表中的行列计数

import globimport osimport sysfrom xlrd import open_workbookinput_directory = sys.argv[1]workbook_counter = 0for input_file in glob.glob(os.path.join(input_directory, '*.xls*')):    workbook = open_workbook(input_file)    print('Workbook: {}'.format(os.path.basename(input_file)))    print('Number of worksheets: {}'.format(workbook.nsheets))    for worksheet in workbook.sheets():        print('Worksheet name:', worksheet.name, '\tRows:',\                worksheet.nrows, '\tColumns:', worksheet.ncols)    workbook_counter += 1print('Number of Excel workbooks: {}'.format(workbook_counter))
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15

在一个Excel文件所有工作表中选取特定行

import pandas as pdinput_file='sales_2013.xlsx'output_file='all_sheeet_column_title.xls'data_frame=pd.read_excel(input_file,sheet_name=None,index_col=None)row_output=[]for worksheet_name , data in data_frame.items(): row_output.append(data[data['Sale Amount'].astype(float) >2279.0])filtered_rows=pd.concat(row_output,axis=0,ignore_index=True)writer=pd.ExcelWriter(output_file)filtered_rows.to_excel(writer,sheet_name='sale_amount_gt200',index=False)writer.save()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11

在一个Excel所有工作表中选取特定列

import pandas as pdimport sysinput_file='sales_2013.xlsx'output_file='pandas_column_by_name_all_worksheets.xls'data_frame = pd.read_excel(input_file, sheet_name=None, index_col=None)column_output = []for worksheet_name, data in data_frame.items():    column_output.append(data.loc[:, ['Customer Name', 'Sale Amount']])selected_columns = pd.concat(column_output, axis=0, ignore_index=True)writer = pd.ExcelWriter(output_file)selected_columns.to_excel(writer, sheet_name='selected_columns_all_worksheets', index=False)writer.save()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13

在Excel工作簿中读取一组工作表
也就是说只读取某些工作表
下面包含两种筛选方式:1 索引筛选 2 根据工作表名字筛选

import pandas as pdimport sysinput_file='sales_2013.xlsx'output_file='pandas_value_meets_condition_set_of_.xls'my_sheets = ['january_2013','february_2013'] #按照名字#my_sheets = [0,1] 按照索引threshold = 1900.0data_frame = pd.read_excel(input_file, sheet_name=my_sheets, index_col=None)row_list = []for worksheet_name, data in data_frame.items(): row_list.append(data[data['Sale Amount'].replace('$', '').replace(',', '').astype(float) > threshold])filtered_rows = pd.concat(row_list, axis=0, ignore_index=True)writer = pd.ExcelWriter(output_file)filtered_rows.to_excel(writer, sheet_name='set_of_worksheets', index=False)writer.save()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15

将多个工作簿中的多个工作表整合到一个Excel中

import pandas as pdimport globimport osimport sysinput_path='./'output_file='all_data_all_workbooks.xls'all_workbooks = glob.glob(os.path.join(input_path,'*.xls*'))data_frames = []for workbook in all_workbooks:    all_worksheets = pd.read_excel(workbook, sheet_name=None, index_col=None)    for worksheet_name, data in all_worksheets.items():        data_frames.append(data)all_data_concatenated = pd.concat(data_frames, axis=0, ignore_index=True)writer = pd.ExcelWriter(output_file)all_data_concatenated.to_excel(writer, sheet_name='all_data_all_workbooks', index=False)writer.save()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16

为每个工作簿和工作表计算总数和平均值

import pandas as pdimport globimport osinput_path = './'output_file = 'pandas_sum_average_multiple_workbooks.xls'all_workbooks = glob.glob(os.path.join(input_path, '*.xls*'))data_frames = []for workbook in all_workbooks: all_worksheets = pd.read_excel(workbook, sheet_name=None, index_col=None) workbook_total_sales = [] workbook_number_of_sales = [] worksheet_data_frames = [] worksheets_data_frame = None workbook_data_frame = None for worksheet_name, data in all_worksheets.items(): total_sales = pd.DataFrame( [float(str(value).strip('$').replace(',', '')) for value in data.ix[:, 'Sale Amount']]).sum() number_of_sales = len(data.loc[:, 'Sale Amount']) average_sales = pd.DataFrame(total_sales / number_of_sales) workbook_total_sales.append(total_sales) workbook_number_of_sales.append(number_of_sales) data = {'workbook': os.path.basename(workbook), 'worksheet': worksheet_name, 'worksheet_total': total_sales, 'worksheet_average': average_sales} worksheet_data_frames.append( pd.DataFrame(data, columns=['workbook', 'worksheet', 'worksheet_total', 'worksheet_average'])) worksheets_data_frame = pd.concat(worksheet_data_frames, axis=0, ignore_index=True) workbook_total = pd.DataFrame(workbook_total_sales).sum() workbook_total_number_of_sales = pd.DataFrame(workbook_number_of_sales).sum() workbook_average = pd.DataFrame(workbook_total / workbook_total_number_of_sales) workbook_stats = {'workbook': os.path.basename(workbook), 'workbook_total': workbook_total, 'workbook_average': workbook_average} workbook_stats = pd.DataFrame(workbook_stats, columns=['workbook', 'workbook_total', 'workbook_average']) #workbook_data_frame == worksheets_data_frame + workbook_stats workbook_data_frame = pd.merge(worksheets_data_frame, workbook_stats, on='workbook', how='left') data_frames.append(workbook_data_frame)all_data_concatenated = pd.concat(data_frames, axis=0, ignore_index=True)writer = pd.ExcelWriter(output_file)all_data_concatenated.to_excel(writer, sheet_name='sums_and_averages', index=False)writer.save()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44

    本站是提供个人知识管理的网络存储空间,所有内容均由用户发布,不代表本站观点。请注意甄别内容中的联系方式、诱导购买等信息,谨防诈骗。如发现有害或侵权内容,请点击一键举报。
    转藏 分享 献花(0

    0条评论

    发表

    请遵守用户 评论公约