分享

从零开始学Python数据分析:详解Pandas 基础

 wenxuefeng360 2022-07-17 发布于四川
话不多说,直接上代码,全部可在 Jupyter Notebook 里运行。
from pandas import Series,DataFrame
import pandas as pd
obj = Series([1, -2, 3, -4])
obj
0 1
1 -2
2 3
3 -4
dtype: int64
obj2 = Series([1, -2, 3, -4], index=['a', 'b', 'c', 'd'])
obj2
a 1
b -2
c 3
d -4
dtype: int64
obj2.values
array([ 1, -2, 3, -4], dtype=int64)
obj2.index
Index(['a', 'b', 'c', 'd'], dtype='object')
obj2['b']
-2
obj2['c'] = 23
obj2[['c', 'd']]
c 23
d -4
dtype: int64
obj2
a 1
b -2
c 23
d -4
dtype: int64
obj2[obj2 < 0 ]
b -2
d -4
dtype: int64
obj2 * 2
a 2
b -4
c 46
d -8
dtype: int64
import numpy as np
np.abs(obj2)
a 1
b 2
c 23
d 4
dtype: int64
data = {
'张三':92,
'李四':78,
'王五':68,
'小明':82 }
obj3 = Series(data)
obj3
小明 82
张三 92
李四 78
王五 68
dtype: int64
names = ['张三', '李四', '王五', '小明']
obj4 = Series(data, index=names)
obj4
张三 92
李四 78
王五 68
小明 82
dtype: int64
obj4.name = 'math'
obj4.index.name = 'students'
obj4
students
张三 92
李四 78
王五 68
小明 82
Name: math, dtype: int64

dataframe

import numpy as np
from pandas import Series,DataFrame
import pandas as pd
data = {
'name':['张三', '李四', '王五', '小明'],
'sex':['female', 'female', 'male', 'male'],
'year':[2001, 2001, 2003, 2002],
'city':['北京', '上海', '广州', '北京']}
df = DataFrame(data)
df

city

name

sex

year

0

北京

张三

female

2001

1

上海

李四

female

2001

2

广州

王五

male

2003

3

北京

小明

male

2002

df = DataFrame(data, columns=['name', 'sex', 'year', 'city'])
df

name

sex

year

city

0

张三

female

2001

北京

1

李四

female

2001

上海

2

王五

male

2003

广州

3

小明

male

2002

北京

df = DataFrame(data, columns=['name', 'sex', 'year', 'city'],index=['a', 'b', 'c', 'd'])
df

name

sex

year

city

a

张三

female

2001

北京

b

李四

female

2001

上海

c

王五

male

2003

广州

d

小明

male

2002

北京

df.index
Index(['a', 'b', 'c', 'd'], dtype='object')
df.columns
Index(['name', 'sex', 'year', 'city'], dtype='object')
data2 = {
'sex':{'张三':'female','李四':'female','王五':'male'},
'city':{'张三':'北京','李四':'上海','王五':'广州'}}
df2 = DataFrame(data2)
df2

city

sex

张三

北京

female

李四

上海

female

王五

广州

male

df.index.name = 'id'
df.columns.name = 'std_info'
df

std_info

name

sex

year

city

id





a

张三

female

2001

北京

b

李四

female

2001

上海

c

王五

male

2003

广州

d

小明

male

2002

北京

obj = Series([1, -2, 3, -4], index=['a', 'b', 'c', 'd'])
obj
a 1
b -2
c 3
d -4
dtype: int64
obj.index
Index(['a', 'b', 'c', 'd'], dtype='object')
df.index
Index(['a', 'b', 'c', 'd'], dtype='object', name='id')
df.columns
Index(['name', 'sex', 'year', 'city'], dtype='object', name='std_info')
index = obj.index
index[1] = 'f'
---------------------------------------------------------------------------

TypeError Traceback (most recent call last)

<ipython-input-14-4f995da5e969> in <module>()
1 index = obj.index
----> 2 index[1] = 'f'


F:\Anaconda\envs\data-analysis\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value)
1668
1669 def __setitem__(self, key, value):
-> 1670 raise TypeError("Index does not support mutable operations")
1671
1672 def __getitem__(self, key):


TypeError: Index does not support mutable operations
df

std_info

name

sex

year

city

id





a

张三

female

2001

北京

b

李四

female

2001

上海

c

王五

male

2003

广州

d

小明

male

2002

北京

'sex' in df.columns
True
'f' in df.index
False
obj = Series([1, -2, 3, -4], index=['b', 'a', 'c', 'd'])
obj
b 1
a -2
c 3
d -4
dtype: int64
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2
a -2.0
b 1.0
c 3.0
d -4.0
e NaN
dtype: float64
obj = Series([1, -2, 3, -4], index=[0,2,3,5])
obj
0 1
2 -2
3 3
5 -4
dtype: int64
obj2 = obj.reindex(range(6),method='ffill')
obj2
0 1
1 1
2 -2
3 3
4 3
5 -4
dtype: int64
df = DataFrame(np.arange(9).reshape(3,3),index=['a','c','d'],columns=['name','id','sex'])
df

name

id

sex

a

0

1

2

c

3

4

5

d

6

7

8

df2 = df.reindex(['a', 'b', 'c', 'd'])
df2

name

id

sex

a

0.0

1.0

2.0

b

NaN

NaN

NaN

c

3.0

4.0

5.0

d

6.0

7.0

8.0

df3 = df.reindex(columns=['name', 'year', 'id'], fill_value=0)
df3

name

year

id

a

0

0

1

c

3

0

4

d

6

0

7

data = {
'name':['张三', '李四', '王五', '小明'],
'grade':[68, 78, 63, 92]}
df = DataFrame(data)
df

grade

name

0

68

张三

1

78

李四

2

63

王五

3

92

小明

df2 = df.sort_values(by='grade')
df2

grade

name

2

63

王五

0

68

张三

1

78

李四

3

92

小明

df3 = df2.reset_index()
df3

index

grade

name

0

2

63

王五

1

0

68

张三

2

1

78

李四

3

3

92

小明

df4 = df2.reset_index(drop=True)
df4

grade

name

0

63

王五

1

68

张三

2

78

李四

3

92

小明

data = {
'name':['张三', '李四', '王五', '小明'],
'sex':['female', 'female', 'male', 'male'],
'year':[2001, 2001, 2003, 2002],
'city':['北京', '上海', '广州', '北京']}
df = DataFrame(data)
df

city

name

sex

year

0

北京

张三

female

2001

1

上海

李四

female

2001

2

广州

王五

male

2003

3

北京

小明

male

2002

df2 = df.set_index('name')
df2

city

sex

year

name




张三

北京

female

2001

李四

上海

female

2001

王五

广州

male

2003

小明

北京

male

2002

df3 = df2.reset_index()
df3

name

city

sex

year

0

张三

北京

female

2001

1

李四

上海

female

2001

2

王五

广州

male

2003

3

小明

北京

male

2002

索引和选取

import numpy as np
from pandas import Series,DataFrame
import pandas as pd
obj = Series([1, -2, 3, -4], index=['a', 'b', 'c', 'd'])
obj
a 1
b -2
c 3
d -4
dtype: int64
obj[1]
-2
obj['b']
-2
obj[['a','c']]
a 1
c 3
dtype: int64
obj[0:2]
a 1
b -2
dtype: int64
obj['a':'c']
a 1
b -2
c 3
dtype: int64
data = {
'name':['张三', '李四', '王五', '小明'],
'sex':['female', 'female', 'male', 'male'],
'year':[2001, 2001, 2003, 2002],
'city':['北京', '上海', '广州', '北京']}
df = DataFrame(data)
df

city

name

sex

year

0

北京

张三

female

2001

1

上海

李四

female

2001

2

广州

王五

male

2003

3

北京

小明

male

2002

df['city']
0 北京
1 上海
2 广州
3 北京
Name: city, dtype: object
df.name
0 张三
1 李四
2 王五
3 小明
Name: name, dtype: object
df[['city','sex']]

city

sex

0

北京

female

1

上海

female

2

广州

male

3

北京

male

df2 = df.set_index('name')
df2

city

sex

year

name




张三

北京

female

2001

李四

上海

female

2001

王五

广州

male

2003

小明

北京

male

2002

df2[0:2]

city

sex

year

name




张三

北京

female

2001

李四

上海

female

2001

df2['李四':'王五']

city

sex

year

name




李四

上海

female

2001

王五

广州

male

2003

df2

city

sex

year

name




张三

北京

female

2001

李四

上海

female

2001

王五

广州

male

2003

小明

北京

male

2002

df2.loc['张三']
city 北京
sex female
year 2001
Name: 张三, dtype: object
df2.loc[['张三','王五']]

city

sex

year

name




张三

北京

female

2001

王五

广州

male

2003

df2.iloc[1]
city 上海
sex female
year 2001
Name: 李四, dtype: object
df2.iloc[[1,3]]

city

sex

year

name




李四

上海

female

2001

小明

北京

male

2002

df2.ix[['张三','王五'],0:2]

city

sex

name



张三

北京

female

王五

广州

male

pd.set_option('mode.chained_assignment',None)
df2.ix[:,['sex','year']] #获取列

sex

year

name



张三

female

2001

李四

female

2001

王五

male

2003

小明

male

2002

df2.ix[[1,3],:] #获取行

city

sex

year

name




李四

上海

female

2001

小明

北京

male

2002

df2['sex'] == 'female'
name
张三 True
李四 True
王五 False
小明 False
Name: sex, dtype: bool
df2[df2['sex'] == 'female']

city

sex

year

name




张三

北京

female

2001

李四

上海

female

2001

df2[(df2['sex'] == 'female') & (df2['city'] == '北京')]

city

sex

year

name




张三

北京

female

2001

行和列的操作

df

city

name

sex

year

0

北京

张三

female

2001

1

上海

李四

female

2001

2

广州

王五

male

2003

3

北京

小明

male

2002

new_data = {
'city':'武汉',
'name':'小李',
'sex':'male',
'year':2002}
df = df.append(new_data,ignore_index=True) #忽略索引值
df

city

name

sex

year

0

北京

张三

female

2001

1

上海

李四

female

2001

2

广州

王五

male

2003

3

北京

小明

male

2002

4

武汉

小李

male

2002

df['class'] = 2018
df

city

name

sex

year

class

0

北京

张三

female

2001

2018

1

上海

李四

female

2001

2018

2

广州

王五

male

2003

2018

3

北京

小明

male

2002

2018

4

武汉

小李

male

2002

2018

df['math'] = [92,78,58,69,82]
df

city

name

sex

year

class

math

0

北京

张三

female

2001

2018

92

1

上海

李四

female

2001

2018

78

2

广州

王五

male

2003

2018

58

3

北京

小明

male

2002

2018

69

4

武汉

小李

male

2002

2018

82

new_df = df.drop(2) #删除行
new_df

city

name

sex

year

class

math

0

北京

张三

female

2001

2018

92

1

上海

李四

female

2001

2018

78

3

北京

小明

male

2002

2018

69

4

武汉

小李

male

2002

2018

82

new_df = new_df.drop('class',axis=1) #删除列
new_df

city

name

sex

year

math

0

北京

张三

female

2001

92

1

上海

李四

female

2001

78

3

北京

小明

male

2002

69

4

武汉

小李

male

2002

82

new_df.rename(index={3:2,4:3},columns={'math':'Math'},inplace=True) #inplace可在原数据上修改
new_df

city

name

sex

year

Math

0

北京

张三

female

2001

92

1

上海

李四

female

2001

78

2

北京

小明

male

2002

69

3

武汉

小李

male

2002

82

obj1 = Series([3.2,5.3,-4.4,-3.7],index=['a','c','g','f'])
obj1
a 3.2
c 5.3
g -4.4
f -3.7
dtype: float64
obj2 = Series([5.0,-2,4.4,3.4],index=['a','b','c','d'])
obj2
a 5.0
b -2.0
c 4.4
d 3.4
dtype: float64
obj1 + obj2
a 8.2
b NaN
c 9.7
d NaN
f NaN
g NaN
dtype: float64
df1 = DataFrame(np.arange(9).reshape(3,3),columns=['a','b','c'], index=['apple','tea','banana'])
df1

a

b

c

apple

0

1

2

tea

3

4

5

banana

6

7

8

df2 = DataFrame(np.arange(9).reshape(3,3),columns=['a','b','d'], index=['apple','tea','coco'])
df2

a

b

d

apple

0

1

2

tea

3

4

5

coco

6

7

8

df1 + df2

a

b

c

d

apple

0.0

2.0

NaN

NaN

banana

NaN

NaN

NaN

NaN

coco

NaN

NaN

NaN

NaN

tea

6.0

8.0

NaN

NaN

df1

a

b

c

apple

0

1

2

tea

3

4

5

banana

6

7

8

s = df1.ix['apple']
s
a 0
b 1
c 2
Name: apple, dtype: int32
df1 - s

a

b

c

apple

0

0

0

tea

3

3

3

banana

6

6

6

data = {
'fruit':['apple', 'orange', 'grape', 'banana'],
'price':['25元', '42元', '35元', '14元']}
df1 = DataFrame(data)
df1

fruit

price

0

apple

25元

1

orange

42元

2

grape

35元

3

banana

14元

def f(x):
    return x.split('元')[0]

df1['price'] = df1['price'].map(f)
df1

fruit

price

0

apple

25

1

orange

42

2

grape

35

3

banana

14

df2 = DataFrame(np.random.randn(3,3),columns=['a','b','c'],index=['app','win','mac'])
df2

a

b

c

app

1.507962

-2.140018

0.053571

win

0.729671

0.207060

0.397773

mac

-0.191497

-0.765726

-0.266327

f = lambda x:x.max()-x.min()
df2.apply(f)
a 1.699460
b 2.347079
c 0.664100
dtype: float64
df2

a

b

c

app

1.507962

-2.140018

0.053571

win

0.729671

0.207060

0.397773

mac

-0.191497

-0.765726

-0.266327

df2.applymap(lambda x:'%.2f'%x)

a

b

c

app

1.51

-2.14

0.05

win

0.73

0.21

0.40

mac

-0.19

-0.77

-0.27

obj1 = Series([-2,3,2,1],index=['b','a','d','c'])obj1
b -2
a 3
d 2
c 1
dtype: int64
obj1.sort_index() #升序
a 3
b -2
c 1
d 2
dtype: int64
obj1.sort_index(ascending=False) #降序
d 2
c 1
b -2
a 3
dtype: int64
obj1.sort_values()
b -2
c 1
d 2
a 3
dtype: int64
df2

a

b

c

app

1.507962

-2.140018

0.053571

win

0.729671

0.207060

0.397773

mac

-0.191497

-0.765726

-0.266327

df2.sort_values(by='b')

a

b

c

app

1.507962

-2.140018

0.053571

mac

-0.191497

-0.765726

-0.266327

win

0.729671

0.207060

0.397773

df = DataFrame(np.random.randn(9).reshape(3,3),columns=['a','b','c'])df

a

b

c

0

0.660215

-1.137716

-0.302954

1

1.496589

-0.768645

-2.091506

2

0.170316

-2.682284

-0.041099

df.sum()
a 2.327120
b -4.588645
c -2.435558
dtype: float64
df.sum(axis=1)
0 -0.780455
1 -1.363562
2 -2.553067
dtype: float64
data = {
'name':['张三', '李四', '王五', '小明'],
'sex':['female', 'female', 'male', 'male'],
'math':[78, 79, 83, 92],
'city':['北京', '上海', '广州', '北京']}
df = DataFrame(data)
df

city

math

name

sex

0

北京

78

张三

female

1

上海

79

李四

female

2

广州

83

王五

male

3

北京

92

小明

male

df.describe()

math

count

4.000000

mean

83.000000

std

6.377042

min

78.000000

25%

78.750000

50%

81.000000

75%

85.250000

max

92.000000

obj = Series(['a','b','a','c','b'])
obj
0 a
1 b
2 a
3 c
4 b
dtype: object
obj.unique()
array(['a', 'b', 'c'], dtype=object)
obj.value_counts()
a 2
b 2
c 1
dtype: int64
obj = Series(np.random.randn(9),
             index=[['one','one','one','two','two','two','three','three','three'],
                    ['a','b','c','a','b','c','a','b','c']])
obj
one a 0.697195
b -0.887408
c 0.451851
two a 0.390779
b -2.058070
c 0.760594
three a -0.305534
b -0.720491
c -0.259225
dtype: float64
obj.index
MultiIndex(levels=[['one', 'three', 'two'], ['a', 'b', 'c']],
labels=[[0, 0, 0, 2, 2, 2, 1, 1, 1], [0, 1, 2, 0, 1, 2, 0, 1, 2]])
obj['two']
a 0.390779
b -2.058070
c 0.760594
dtype: float64
obj[:,'a'] #内层选取
one 0.697195
two 0.390779
three -0.305534
dtype: float64
df = DataFrame(np.arange(16).reshape(4,4),
               index=[['one','one','two','two'],['a','b','a','b']],
               columns=[['apple','apple','orange','orange'],['red','green','red','green']])
df


apple

orange





red

green

red

green

one

a

0

1

2

3

b

4

5

6

7


two

a

8

9

10

11

b

12

13

14

15


df['apple']


red

green

one

a

0

1

b

4

5


two

a

8

9

b

12

13


df.swaplevel(0,1)


apple

orange





red

green

red

green

a

one

0

1

2

3

b

one

4

5

6

7

a

two

8

9

10

11

b

two

12

13

14

15

df.sum(level=0)

apple

orange




red

green

red

green

one

4

6

8

10

two

20

22

24

26

df.sum(level=1,axis=1)


green

red

one

a

4

2

b

12

10


two

a

20

18

b

28

26


pandas数据可视化

import numpy as np
from pandas import Series,DataFrame
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt #导入matplotlib库
#魔法函数
%matplotlib inline
s = Series(np.random.normal(size=10))
s
0 -0.468142
1 -1.408927
2 -0.182548
3 -0.043023
4 0.121437
5 0.539194
6 0.011423
7 -0.938207
8 1.589460
9 0.460753
dtype: float64
s.plot()
<matplotlib.axes._subplots.AxesSubplot at 0xafc5390>

图片

df = DataFrame({'normal': np.random.normal(size=100),
'gamma': np.random.gamma(1, size=100),
'poisson': np.random.poisson(size=100)})
df.cumsum()

gamma

normal

poisson

0

1.804045

1.788000

0.0

1

1.835715

0.089426

0.0

2

3.850210

0.870177

0.0

3

6.082898

0.902761

0.0

4

8.837446

0.959945

1.0

5

9.307126

1.658268

3.0

6

9.518029

3.118419

6.0

7

9.758011

3.861418

6.0

8

10.481856

3.405625

6.0

9

12.405202

4.892910

7.0

10

13.086167

4.776206

7.0

11

13.457807

3.217277

8.0

12

13.574663

1.821368

9.0

13

13.695523

2.829581

10.0

14

13.819044

3.015490

11.0

15

15.801080

2.629254

13.0

16

17.043867

2.052196

14.0

17

17.089774

3.687834

15.0

18

17.499338

2.635491

16.0

19

18.257891

2.636466

18.0

20

19.101743

2.272298

19.0

21

24.158020

-0.113947

20.0

22

25.112218

-0.594266

23.0

23

25.986628

-1.326405

23.0

24

28.383365

-1.349211

23.0

25

28.753694

-1.527589

23.0

26

28.908734

-1.312111

25.0

27

30.607696

0.228251

26.0

28

31.081009

1.067429

27.0

29

31.330353

1.098605

28.0

...

...

...

...

70

72.302929

14.123995

66.0

71

72.794689

14.860449

67.0

72

73.629651

14.828726

67.0

73

74.610837

14.168664

68.0

74

78.773897

13.334949

70.0

75

80.916582

13.722037

71.0

76

81.994526

14.717187

72.0

77

83.927355

13.784763

72.0

78

86.004903

13.343261

75.0

79

86.609627

12.151334

75.0

80

87.199249

13.345584

77.0

81

87.213180

12.311815

77.0

82

87.553190

13.864232

77.0

83

89.157662

14.439016

78.0

84

89.213456

14.401503

80.0

85

89.471336

15.838362

81.0

86

89.552332

14.406933

81.0

87

91.565291

14.520602

82.0

88

94.179919

12.017739

82.0

89

95.075841

13.279973

83.0

90

95.192719

13.089789

83.0

91

96.148316

12.268122

84.0

92

97.146898

11.830559

84.0

93

97.456375

13.035484

86.0

94

99.877122

11.966609

87.0

95

103.015620

12.313341

88.0

96

103.116648

12.715195

88.0

97

103.490265

12.168645

89.0

98

103.925893

11.502630

89.0

99

105.008619

11.193637

89.0

100 rows × 3 columns

df.cumsum().plot()
<matplotlib.axes._subplots.AxesSubplot at 0xaef4c18>

图片


data = {
'name':['张三', '李四', '王五', '小明', 'Peter'],
'sex':['female', 'female', 'male', 'male','male'],
'year':[2001, 2001, 2003, 2002, 2002],
'city':['北京', '上海', '广州', '北京', '北京']}
df = DataFrame(data)
df

city

name

sex

year

0

北京

张三

female

2001

1

上海

李四

female

2001

2

广州

王五

male

2003

3

北京

小明

male

2002

4

北京

Peter

male

2002

df['sex'].value_counts()
male 3
female 2
Name: sex, dtype: int64
df['sex'].value_counts().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0xaf1ac50>

图片

df2 = DataFrame(np.random.randint(0,100,size=(3,3)),
                index=('one','two','three'),
                columns=['A','B','C'])
df2

A

B

C

one

29

5

88

two

35

42

43

three

87

85

76

df2.plot(kind='barh')
<matplotlib.axes._subplots.AxesSubplot at 0xb5b53c8>

图片

df2.plot(kind='barh',stacked=True,alpha=0.5)
<matplotlib.axes._subplots.AxesSubplot at 0xd576cf8>

图片

s = Series(np.random.normal(size=100))
s.hist(bins=20,grid=False)
<matplotlib.axes._subplots.AxesSubplot at 0xcf9f5c0>

图片

s.plot(kind='kde')
<matplotlib.axes._subplots.AxesSubplot at 0xd266710>

图片

df3 = DataFrame(np.arange(10),columns=['X'])
df3['Y'] = 2 * df3['X'] + 5
df3

X

Y

0

0

5

1

1

7

2

2

9

3

3

11

4

4

13

5

5

15

6

6

17

7

7

19

8

8

21

9

9

23

df3.plot(kind='scatter',x='X',y='Y')
<matplotlib.axes._subplots.AxesSubplot at 0xb1f98d0>

图片

import numpy as np
from pandas import Series,DataFrame
import pandas as pd
import seaborn as sns #导入seaborn库
tips=sns.load_dataset('tips')
tips.head()

total_bill

tip

sex

smoker

day

time

size

0

16.99

1.01

Female

No

Sun

Dinner

2

1

10.34

1.66

Male

No

Sun

Dinner

3

2

21.01

3.50

Male

No

Sun

Dinner

3

3

23.68

3.31

Male

No

Sun

Dinner

2

4

24.59

3.61

Female

No

Sun

Dinner

4

tips.shape
(244, 7)
tips.describe()

total_bill

tip

size

count

244.000000

244.000000

244.000000

mean

19.785943

2.998279

2.569672

std

8.902412

1.383638

0.951100

min

3.070000

1.000000

1.000000

25%

13.347500

2.000000

2.000000

50%

17.795000

2.900000

2.000000

75%

24.127500

3.562500

3.000000

max

50.810000

10.000000

6.000000

tips.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill 244 non-null float64
tip 244 non-null float64
sex 244 non-null category
smoker 244 non-null category
day 244 non-null category
time 244 non-null category
size 244 non-null int64
dtypes: category(4), float64(2), int64(1)
memory usage: 7.2 KB
tips.plot(kind='scatter',x='total_bill',y='tip')
<matplotlib.axes._subplots.AxesSubplot at 0xe034828>

图片

male_tip = tips[tips['sex'] == 'Male']['tip'].mean()
male_tip
3.0896178343949052
female_tip = tips[tips['sex'] == 'Female']['tip'].mean()
female_tip
2.833448275862069
s = Series([male_tip,female_tip],index=['male','female'])
s
male 3.089618
female 2.833448
dtype: float64
s.plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0xddd27f0>

图片

tips['day'].unique()
[Sun, Sat, Thur, Fri]
Categories (4, object): [Sun, Sat, Thur, Fri]
sun_tip = tips[tips['day'] == 'Sun']['tip'].mean()
sat_tip = tips[tips['day'] == 'Sat']['tip'].mean()
thur_tip = tips[tips['day'] == 'Thur']['tip'].mean()
fri_tip = tips[tips['day'] == 'Fri']['tip'].mean()
s = Series([thur_tip,fri_tip,sat_tip,sun_tip],index=['Thur','Fri','Sat','Sun'])
s
Thur 2.771452
Fri 2.734737
Sat 2.993103
Sun 3.255132
dtype: float64
s.plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0xdefe5c0>

图片

tips['percent_tip'] = tips['tip']/(tips['total_bill']+tips['tip'])
tips.head(10)

total_bill

tip

sex

smoker

day

time

size

percent_tip

0

16.99

1.01

Female

No

Sun

Dinner

2

0.056111

1

10.34

1.66

Male

No

Sun

Dinner

3

0.138333

2

21.01

3.50

Male

No

Sun

Dinner

3

0.142799

3

23.68

3.31

Male

No

Sun

Dinner

2

0.122638

4

24.59

3.61

Female

No

Sun

Dinner

4

0.128014

5

25.29

4.71

Male

No

Sun

Dinner

4

0.157000

6

8.77

2.00

Male

No

Sun

Dinner

2

0.185701

7

26.88

3.12

Male

No

Sun

Dinner

4

0.104000

8

15.04

1.96

Male

No

Sun

Dinner

2

0.115294

9

14.78

3.23

Male

No

Sun

Dinner

2

0.179345

tips['percent_tip'].hist(bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0xe264710>

图片

    本站是提供个人知识管理的网络存储空间,所有内容均由用户发布,不代表本站观点。请注意甄别内容中的联系方式、诱导购买等信息,谨防诈骗。如发现有害或侵权内容,请点击一键举报。
    转藏 分享 献花(0

    0条评论

    发表

    请遵守用户 评论公约