话不多说，直接上代码，全部代码均可在 Jupyter Notebook 里运行。
from pandas import Series, DataFrame
import pandas as pd
# Build a Series from a plain list; with no index given, the labels are 0..n-1.
obj = Series([1, -2, 3, -4])
obj
0 1 1 -2 2 3 3 -4 dtype: int64
# Same values as before, but with explicit string labels instead of 0..n-1.
obj2 = Series([1, -2, 3, -4], index=['a', 'b', 'c', 'd'])
obj2
a 1 b -2 c 3 d -4 dtype: int64
obj2 . values
array([ 1, -2, 3, -4], dtype=int64)
obj2 . index
Index(['a', 'b', 'c', 'd'], dtype='object')
obj2 [ 'b' ]
-2
obj2 [ 'c' ] = 23 obj2 [ [ 'c' , 'd' ] ]
c 23 d -4 dtype: int64
obj2
a 1 b -2 c 23 d -4 dtype: int64
obj2 [ obj2 < 0 ]
b -2 d -4 dtype: int64
obj2 * 2
a 2 b -4 c 46 d -8 dtype: int64
import numpy as np
np . abs ( obj2 )
a 1 b 2 c 23 d 4 dtype: int64
# Scores keyed by student name.
data = {'张三': 92, '李四': 78, '王五': 68, '小明': 82}
# Building a Series from a dict uses the keys as index labels and the values as data.
obj3 = Series(data)
obj3
小明 82 张三 92 李四 78 王五 68 dtype: int64
names = [ '张三' , '李四' , '王五' , '小明' ] obj4 = Series ( data , index = names ) obj4
张三 92 李四 78 王五 68 小明 82 dtype: int64
obj4 . name = 'math' obj4 . index . name = 'students'
obj4
students 张三 92 李四 78 王五 68 小明 82 Name: math, dtype: int64
DataFrame
import numpy as np from pandas import Series , DataFrame import pandas as pd
# Column-oriented input: each key is a column name, each list holds that column's values.
data = {
    'name': ['张三', '李四', '王五', '小明'],
    'sex': ['female', 'female', 'male', 'male'],
    'year': [2001, 2001, 2003, 2002],
    'city': ['北京', '上海', '广州', '北京'],
}
df = DataFrame(data)
df
city
name
sex
year
0
北京
张三
female
2001
1
上海
李四
female
2001
2
广州
王五
male
2003
3
北京
小明
male
2002
df = DataFrame ( data , columns = [ 'name' , 'sex' , 'year' , 'city' ] ) df
name
sex
year
city
0
张三
female
2001
北京
1
李四
female
2001
上海
2
王五
male
2003
广州
3
小明
male
2002
北京
df = DataFrame ( data , columns = [ 'name' , 'sex' , 'year' , 'city' ] , index = [ 'a' , 'b' , 'c' , 'd' ] ) df
name
sex
year
city
a
张三
female
2001
北京
b
李四
female
2001
上海
c
王五
male
2003
广州
d
小明
male
2002
北京
df . index
Index(['a', 'b', 'c', 'd'], dtype='object')
df . columns
Index(['name', 'sex', 'year', 'city'], dtype='object')
# Nested dict input: outer keys become the columns, inner keys become the row labels.
data2 = {
    'sex': {'张三': 'female', '李四': 'female', '王五': 'male'},
    'city': {'张三': '北京', '李四': '上海', '王五': '广州'},
}
df2 = DataFrame(data2)
df2
city
sex
张三
北京
female
李四
上海
female
王五
广州
male
df . index . name = 'id' df . columns . name = 'std_info'
df
std_info
name
sex
year
city
id
a
张三
female
2001
北京
b
李四
female
2001
上海
c
王五
male
2003
广州
d
小明
male
2002
北京
obj = Series ( [ 1 , - 2 , 3 , - 4 ] , index = [ 'a' , 'b' , 'c' , 'd' ] ) obj
a 1 b -2 c 3 d -4 dtype: int64
obj . index
Index(['a', 'b', 'c', 'd'], dtype='object')
df . index
Index(['a', 'b', 'c', 'd'], dtype='object', name='id')
df . columns
Index(['name', 'sex', 'year', 'city'], dtype='object', name='std_info')
index = obj . index index [ 1 ] = 'f'
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-14-4f995da5e969> in <module>() 1 index = obj.index ----> 2 index[1] = 'f' F:\Anaconda\envs\data-analysis\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value) 1668 1669 def __setitem__(self, key, value): -> 1670 raise TypeError("Index does not support mutable operations") 1671 1672 def __getitem__(self, key): TypeError: Index does not support mutable operations
df
std_info
name
sex
year
city
id
a
张三
female
2001
北京
b
李四
female
2001
上海
c
王五
male
2003
广州
d
小明
male
2002
北京
'sex' in df . columns
True
'f' in df . index
False
obj = Series ( [ 1 , - 2 , 3 , - 4 ] , index = [ 'b' , 'a' , 'c' , 'd' ] ) obj
b 1 a -2 c 3 d -4 dtype: int64
obj2 = obj . reindex ( [ 'a' , 'b' , 'c' , 'd' , 'e' ] ) obj2
a -2.0 b 1.0 c 3.0 d -4.0 e NaN dtype: float64
obj = Series ( [ 1 , - 2 , 3 , - 4 ] , index = [ 0 , 2 , 3 , 5 ] ) obj
0 1 2 -2 3 3 5 -4 dtype: int64
obj2 = obj . reindex ( range ( 6 ) , method = 'ffill' ) obj2
0 1 1 1 2 -2 3 3 4 3 5 -4 dtype: int64
# 3x3 frame holding 0..8; the row labels are a/c/d, so 'b' is deliberately absent.
df = DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['name', 'id', 'sex'],
)
df
df2 = df . reindex ( [ 'a' , 'b' , 'c' , 'd' ] ) df2
name
id
sex
a
0.0
1.0
2.0
b
NaN
NaN
NaN
c
3.0
4.0
5.0
d
6.0
7.0
8.0
df3 = df . reindex ( columns = [ 'name' , 'year' , 'id' ] , fill_value = 0 ) df3
# Small name/grade table used by the sorting and index-reset cells.
data = {
    'name': ['张三', '李四', '王五', '小明'],
    'grade': [68, 78, 63, 92],
}
df = DataFrame(data)
df
grade
name
0
68
张三
1
78
李四
2
63
王五
3
92
小明
df2 = df . sort_values ( by = 'grade' ) df2
grade
name
2
63
王五
0
68
张三
1
78
李四
3
92
小明
df3 = df2 . reset_index ( ) df3
index
grade
name
0
2
63
王五
1
0
68
张三
2
1
78
李四
3
3
92
小明
df4 = df2 . reset_index ( drop = True ) df4
grade
name
0
63
王五
1
68
张三
2
78
李四
3
92
小明
data = { 'name' : [ '张三' , '李四' , '王五' , '小明' ] , 'sex' : [ 'female' , 'female' , 'male' , 'male' ] , 'year' : [ 2001 , 2001 , 2003 , 2002 ] , 'city' : [ '北京' , '上海' , '广州' , '北京' ] } df = DataFrame ( data ) df
city
name
sex
year
0
北京
张三
female
2001
1
上海
李四
female
2001
2
广州
王五
male
2003
3
北京
小明
male
2002
df2 = df . set_index ( 'name' ) df2
city
sex
year
name
张三
北京
female
2001
李四
上海
female
2001
王五
广州
male
2003
小明
北京
male
2002
df3 = df2 . reset_index ( ) df3
name
city
sex
year
0
张三
北京
female
2001
1
李四
上海
female
2001
2
王五
广州
male
2003
3
小明
北京
male
2002
索引和选取
import numpy as np from pandas import Series , DataFrame import pandas as pd
obj = Series ( [ 1 , - 2 , 3 , - 4 ] , index = [ 'a' , 'b' , 'c' , 'd' ] ) obj
a 1 b -2 c 3 d -4 dtype: int64
# Pick the second element by position. On a label-indexed Series, plain
# obj[1] depends on the deprecated integer-position fallback of
# Series.__getitem__, so the positional intent is stated with .iloc.
obj.iloc[1]
-2
obj [ 'b' ]
-2
obj [ [ 'a' , 'c' ] ]
a 1 c 3 dtype: int64
obj [ 0 : 2 ]
a 1 b -2 dtype: int64
obj [ 'a' : 'c' ]
a 1 b -2 c 3 dtype: int64
data = { 'name' : [ '张三' , '李四' , '王五' , '小明' ] , 'sex' : [ 'female' , 'female' , 'male' , 'male' ] , 'year' : [ 2001 , 2001 , 2003 , 2002 ] , 'city' : [ '北京' , '上海' , '广州' , '北京' ] } df = DataFrame ( data ) df
city
name
sex
year
0
北京
张三
female
2001
1
上海
李四
female
2001
2
广州
王五
male
2003
3
北京
小明
male
2002
df [ 'city' ]
0 北京 1 上海 2 广州 3 北京 Name: city, dtype: object
df . name
0 张三 1 李四 2 王五 3 小明 Name: name, dtype: object
df [ [ 'city' , 'sex' ] ]
city
sex
0
北京
female
1
上海
female
2
广州
male
3
北京
male
df2 = df . set_index ( 'name' ) df2
city
sex
year
name
张三
北京
female
2001
李四
上海
female
2001
王五
广州
male
2003
小明
北京
male
2002
df2 [ 0 : 2 ]
city
sex
year
name
张三
北京
female
2001
李四
上海
female
2001
df2 [ '李四' : '王五' ]
city
sex
year
name
李四
上海
female
2001
王五
广州
male
2003
df2
city
sex
year
name
张三
北京
female
2001
李四
上海
female
2001
王五
广州
male
2003
小明
北京
male
2002
df2 . loc [ '张三' ]
city 北京 sex female year 2001 Name: 张三, dtype: object
df2 . loc [ [ '张三' , '王五' ] ]
city
sex
year
name
张三
北京
female
2001
王五
广州
male
2003
df2 . iloc [ 1 ]
city 上海 sex female year 2001 Name: 李四, dtype: object
df2 . iloc [ [ 1 , 3 ] ]
city
sex
year
name
李四
上海
female
2001
小明
北京
male
2002
# .ix mixed label and position lookups and was removed in pandas 1.0.
# Rows are chosen by label (.loc), then the first two columns by position (.iloc).
df2.loc[['张三', '王五']].iloc[:, 0:2]
city
sex
name
张三
北京
female
王五
广州
male
pd . set_option ( 'mode.chained_assignment' , None )
# Select columns by label; .loc replaces the removed .ix indexer.
df2.loc[:, ['sex', 'year']]
sex
year
name
张三
female
2001
李四
female
2001
王五
male
2003
小明
male
2002
# Select rows by integer position; .iloc replaces the removed .ix indexer.
df2.iloc[[1, 3], :]
city
sex
year
name
李四
上海
female
2001
小明
北京
male
2002
df2 [ 'sex' ] == 'female'
name 张三 True 李四 True 王五 False 小明 False Name: sex, dtype: bool
df2 [ df2 [ 'sex' ] == 'female' ]
city
sex
year
name
张三
北京
female
2001
李四
上海
female
2001
df2 [ ( df2 [ 'sex' ] == 'female' ) & ( df2 [ 'city' ] == '北京' ) ]
city
sex
year
name
张三
北京
female
2001
行和列的操作
df
city
name
sex
year
0
北京
张三
female
2001
1
上海
李四
female
2001
2
广州
王五
male
2003
3
北京
小明
male
2002
# One new record, keyed by column name.
new_data = {'city': '武汉', 'name': '小李', 'sex': 'male', 'year': 2002}
# DataFrame.append was removed in pandas 2.0. Wrapping the record in a one-row
# frame and concatenating gives the same result on old and new versions.
# ignore_index=True discards the existing row labels and renumbers from 0.
df = pd.concat([df, DataFrame([new_data])], ignore_index=True)
df
city
name
sex
year
0
北京
张三
female
2001
1
上海
李四
female
2001
2
广州
王五
male
2003
3
北京
小明
male
2002
4
武汉
小李
male
2002
df [ 'class' ] = 2018 df
city
name
sex
year
class
0
北京
张三
female
2001
2018
1
上海
李四
female
2001
2018
2
广州
王五
male
2003
2018
3
北京
小明
male
2002
2018
4
武汉
小李
male
2002
2018
df [ 'math' ] = [ 92 , 78 , 58 , 69 , 82 ] df
city
name
sex
year
class
math
0
北京
张三
female
2001
2018
92
1
上海
李四
female
2001
2018
78
2
广州
王五
male
2003
2018
58
3
北京
小明
male
2002
2018
69
4
武汉
小李
male
2002
2018
82
new_df = df . drop ( 2 ) #删除行 new_df
city
name
sex
year
class
math
0
北京
张三
female
2001
2018
92
1
上海
李四
female
2001
2018
78
3
北京
小明
male
2002
2018
69
4
武汉
小李
male
2002
2018
82
new_df = new_df . drop ( 'class' , axis = 1 ) #删除列 new_df
city
name
sex
year
math
0
北京
张三
female
2001
92
1
上海
李四
female
2001
78
3
北京
小明
male
2002
69
4
武汉
小李
male
2002
82
new_df . rename ( index = { 3 : 2 , 4 : 3 } , columns = { 'math' : 'Math' } , inplace = True ) #inplace可在原数据上修改 new_df
city
name
sex
year
Math
0
北京
张三
female
2001
92
1
上海
李四
female
2001
78
2
北京
小明
male
2002
69
3
武汉
小李
male
2002
82
obj1 = Series ( [ 3.2 , 5.3 , - 4.4 , - 3.7 ] , index = [ 'a' , 'c' , 'g' , 'f' ] ) obj1
a 3.2 c 5.3 g -4.4 f -3.7 dtype: float64
obj2 = Series ( [ 5.0 , - 2 , 4.4 , 3.4 ] , index = [ 'a' , 'b' , 'c' , 'd' ] ) obj2
a 5.0 b -2.0 c 4.4 d 3.4 dtype: float64
obj1 + obj2
a 8.2 b NaN c 9.7 d NaN f NaN g NaN dtype: float64
# 3x3 frame holding 0..8 with columns a/b/c and rows apple/tea/banana.
df1 = DataFrame(
    np.arange(9).reshape(3, 3),
    columns=['a', 'b', 'c'],
    index=['apple', 'tea', 'banana'],
)
df1
a
b
c
apple
0
1
2
tea
3
4
5
banana
6
7
8
df2 = DataFrame ( np . arange ( 9 ) . reshape ( 3 , 3 ) , columns = [ 'a' , 'b' , 'd' ] , index = [ 'apple' , 'tea' , 'coco' ] ) df2
df1 + df2
a
b
c
d
apple
0.0
2.0
NaN
NaN
banana
NaN
NaN
NaN
NaN
coco
NaN
NaN
NaN
NaN
tea
6.0
8.0
NaN
NaN
df1
a
b
c
apple
0
1
2
tea
3
4
5
banana
6
7
8
# Take the 'apple' row as a Series by label; .loc replaces the removed .ix indexer.
s = df1.loc['apple']
s
a 0 b 1 c 2 Name: apple, dtype: int32
df1 - s
a
b
c
apple
0
0
0
tea
3
3
3
banana
6
6
6
# Prices are stored as text with a trailing currency unit ('元').
data = {
    'fruit': ['apple', 'orange', 'grape', 'banana'],
    'price': ['25元', '42元', '35元', '14元'],
}
df1 = DataFrame(data)
df1
fruit
price
0
apple
25元
1
orange
42元
2
grape
35元
3
banana
14元
def f(x):
    """Return the part of *x* before the first '元' (all of *x* if it has none)."""
    return x.split('元')[0]


# Apply f to every entry of the price column; the results stay strings.
df1['price'] = df1['price'].map(f)
df1
fruit
price
0
apple
25
1
orange
42
2
grape
35
3
banana
14
df2 = DataFrame ( np . random . randn ( 3 , 3 ) , columns = [ 'a' , 'b' , 'c' ] , index = [ 'app' , 'win' , 'mac' ] ) df2
a
b
c
app
1.507962
-2.140018
0.053571
win
0.729671
0.207060
0.397773
mac
-0.191497
-0.765726
-0.266327
def f(x):
    """Return the spread of *x*: its maximum minus its minimum."""
    return x.max() - x.min()


# With the default axis, DataFrame.apply hands each column to f in turn,
# so the result holds one spread per column.
df2.apply(f)
a 1.699460 b 2.347079 c 0.664100 dtype: float64
df2
a
b
c
app
1.507962
-2.140018
0.053571
win
0.729671
0.207060
0.397773
mac
-0.191497
-0.765726
-0.266327
# Render every cell as a string with two decimals. Going column by column
# through Series.map gives the same element-wise result as DataFrame.applymap,
# which newer pandas releases deprecate.
df2.apply(lambda col: col.map(lambda x: '%.2f' % x))
a
b
c
app
1.51
-2.14
0.05
win
0.73
0.21
0.40
mac
-0.19
-0.77
-0.27
obj1 = Series ( [ - 2 , 3 , 2 , 1 ] , index = [ 'b' , 'a' , 'd' , 'c' ] ) obj1
b -2 a 3 d 2 c 1 dtype: int64
obj1 . sort_index ( ) #升序
a 3 b -2 c 1 d 2 dtype: int64
obj1 . sort_index ( ascending = False ) #降序
d 2 c 1 b -2 a 3 dtype: int64
obj1 . sort_values ( )
b -2 c 1 d 2 a 3 dtype: int64
df2
a
b
c
app
1.507962
-2.140018
0.053571
win
0.729671
0.207060
0.397773
mac
-0.191497
-0.765726
-0.266327
df2 . sort_values ( by = 'b' )
a
b
c
app
1.507962
-2.140018
0.053571
mac
-0.191497
-0.765726
-0.266327
win
0.729671
0.207060
0.397773
df = DataFrame ( np . random . randn ( 9 ) . reshape ( 3 , 3 ) , columns = [ 'a' , 'b' , 'c' ] ) df
a
b
c
0
0.660215
-1.137716
-0.302954
1
1.496589
-0.768645
-2.091506
2
0.170316
-2.682284
-0.041099
df . sum ( )
a 2.327120 b -4.588645 c -2.435558 dtype: float64
df . sum ( axis = 1 )
0 -0.780455 1 -1.363562 2 -2.553067 dtype: float64
data = { 'name' : [ '张三' , '李四' , '王五' , '小明' ] , 'sex' : [ 'female' , 'female' , 'male' , 'male' ] , 'math' : [ 78 , 79 , 83 , 92 ] , 'city' : [ '北京' , '上海' , '广州' , '北京' ] } df = DataFrame ( data ) df
city
math
name
sex
0
北京
78
张三
female
1
上海
79
李四
female
2
广州
83
王五
male
3
北京
92
小明
male
df . describe ( )
math
count
4.000000
mean
83.000000
std
6.377042
min
78.000000
25%
78.750000
50%
81.000000
75%
85.250000
max
92.000000
# Series with repeated values, used to demonstrate unique() and value_counts().
obj = Series(['a', 'b', 'a', 'c', 'b'])
obj
0 a 1 b 2 a 3 c 4 b dtype: object
obj . unique ( )
array(['a', 'b', 'c'], dtype=object)
obj . value_counts ( )
a 2 b 2 c 1 dtype: int64
# Passing a list of two label lists as the index builds a two-level
# (hierarchical) index: outer level one/two/three, inner level a/b/c.
obj = Series(
    np.random.randn(9),
    index=[
        ['one', 'one', 'one', 'two', 'two', 'two', 'three', 'three', 'three'],
        ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c'],
    ],
)
obj
one a 0.697195 b -0.887408 c 0.451851 two a 0.390779 b -2.058070 c 0.760594 three a -0.305534 b -0.720491 c -0.259225 dtype: float64
obj . index
MultiIndex(levels=[['one', 'three', 'two'], ['a', 'b', 'c']], labels=[[0, 0, 0, 2, 2, 2, 1, 1, 1], [0, 1, 2, 0, 1, 2, 0, 1, 2]])
obj [ 'two' ]
a 0.390779 b -2.058070 c 0.760594 dtype: float64
obj [ : , 'a' ] #内层选取
one 0.697195 two 0.390779 three -0.305534 dtype: float64
# 4x4 frame holding 0..15 with two-level labels on both axes:
# rows are (one|two, a|b), columns are (apple|orange, red|green).
df = DataFrame(
    np.arange(16).reshape(4, 4),
    index=[['one', 'one', 'two', 'two'], ['a', 'b', 'a', 'b']],
    columns=[
        ['apple', 'apple', 'orange', 'orange'],
        ['red', 'green', 'red', 'green'],
    ],
)
df
apple
orange
red
green
red
green
one
a
0
1
2
3
b
4
5
6
7
two
a
8
9
10
11
b
12
13
14
15
df [ 'apple' ]
red
green
one
a
0
1
b
4
5
two
a
8
9
b
12
13
df . swaplevel ( 0 , 1 )
apple
orange
red
green
red
green
a
one
0
1
2
3
b
one
4
5
6
7
a
two
8
9
10
11
b
two
12
13
14
15
# Sum the rows that share the same outer index label. The level= argument of
# sum() was removed in pandas 2.0; grouping on the level is the equivalent.
df.groupby(level=0).sum()
apple
orange
red
green
red
green
one
4
6
8
10
two
20
22
24
26
# Sum the columns that share the same inner column label (red / green).
# sum(level=..., axis=1) was removed in pandas 2.0. Transposing turns the
# column levels into row levels, so a row-wise groupby can do the aggregation,
# and the second transpose restores the original orientation.
df.T.groupby(level=1).sum().T
green
red
one
a
4
2
b
12
10
two
a
20
18
b
28
26
pandas数据可视化
import numpy as np from pandas import Series , DataFrame import pandas as pd import matplotlib as mpl import matplotlib . pyplot as plt #导入matplotlib库 % matplotlib inline #魔法函数
s = Series ( np . random . normal ( size = 10 ) ) s
0 -0.468142 1 -1.408927 2 -0.182548 3 -0.043023 4 0.121437 5 0.539194 6 0.011423 7 -0.938207 8 1.589460 9 0.460753 dtype: float64
s . plot ( )
<matplotlib.axes._subplots.AxesSubplot at 0xafc5390>
# 100 random draws from each of three distributions, one column per distribution.
df = DataFrame({
    'normal': np.random.normal(size=100),
    'gamma': np.random.gamma(1, size=100),
    'poisson': np.random.poisson(size=100),
})
# Running total down each column.
df.cumsum()
gamma
normal
poisson
0
1.804045
1.788000
0.0
1
1.835715
0.089426
0.0
2
3.850210
0.870177
0.0
3
6.082898
0.902761
0.0
4
8.837446
0.959945
1.0
5
9.307126
1.658268
3.0
6
9.518029
3.118419
6.0
7
9.758011
3.861418
6.0
8
10.481856
3.405625
6.0
9
12.405202
4.892910
7.0
10
13.086167
4.776206
7.0
11
13.457807
3.217277
8.0
12
13.574663
1.821368
9.0
13
13.695523
2.829581
10.0
14
13.819044
3.015490
11.0
15
15.801080
2.629254
13.0
16
17.043867
2.052196
14.0
17
17.089774
3.687834
15.0
18
17.499338
2.635491
16.0
19
18.257891
2.636466
18.0
20
19.101743
2.272298
19.0
21
24.158020
-0.113947
20.0
22
25.112218
-0.594266
23.0
23
25.986628
-1.326405
23.0
24
28.383365
-1.349211
23.0
25
28.753694
-1.527589
23.0
26
28.908734
-1.312111
25.0
27
30.607696
0.228251
26.0
28
31.081009
1.067429
27.0
29
31.330353
1.098605
28.0
...
...
...
...
70
72.302929
14.123995
66.0
71
72.794689
14.860449
67.0
72
73.629651
14.828726
67.0
73
74.610837
14.168664
68.0
74
78.773897
13.334949
70.0
75
80.916582
13.722037
71.0
76
81.994526
14.717187
72.0
77
83.927355
13.784763
72.0
78
86.004903
13.343261
75.0
79
86.609627
12.151334
75.0
80
87.199249
13.345584
77.0
81
87.213180
12.311815
77.0
82
87.553190
13.864232
77.0
83
89.157662
14.439016
78.0
84
89.213456
14.401503
80.0
85
89.471336
15.838362
81.0
86
89.552332
14.406933
81.0
87
91.565291
14.520602
82.0
88
94.179919
12.017739
82.0
89
95.075841
13.279973
83.0
90
95.192719
13.089789
83.0
91
96.148316
12.268122
84.0
92
97.146898
11.830559
84.0
93
97.456375
13.035484
86.0
94
99.877122
11.966609
87.0
95
103.015620
12.313341
88.0
96
103.116648
12.715195
88.0
97
103.490265
12.168645
89.0
98
103.925893
11.502630
89.0
99
105.008619
11.193637
89.0
100 rows × 3 columns
df . cumsum ( ) . plot ( )
<matplotlib.axes._subplots.AxesSubplot at 0xaef4c18>
data = { 'name' : [ '张三' , '李四' , '王五' , '小明' , 'Peter' ] , 'sex' : [ 'female' , 'female' , 'male' , 'male' , 'male' ] , 'year' : [ 2001 , 2001 , 2003 , 2002 , 2002 ] , 'city' : [ '北京' , '上海' , '广州' , '北京' , '北京' ] } df = DataFrame ( data ) df
city
name
sex
year
0
北京
张三
female
2001
1
上海
李四
female
2001
2
广州
王五
male
2003
3
北京
小明
male
2002
4
北京
Peter
male
2002
df [ 'sex' ] . value_counts ( )
male 3 female 2 Name: sex, dtype: int64
df [ 'sex' ] . value_counts ( ) . plot ( kind = 'bar' )
<matplotlib.axes._subplots.AxesSubplot at 0xaf1ac50>
# 3x3 frame of random integers drawn from [0, 100).
df2 = DataFrame(
    np.random.randint(0, 100, size=(3, 3)),
    index=('one', 'two', 'three'),
    columns=['A', 'B', 'C'],
)
df2
A
B
C
one
29
5
88
two
35
42
43
three
87
85
76
df2 . plot ( kind = 'barh' )
<matplotlib.axes._subplots.AxesSubplot at 0xb5b53c8>
df2 . plot ( kind = 'barh' , stacked = True , alpha = 0.5 )
<matplotlib.axes._subplots.AxesSubplot at 0xd576cf8>
s = Series ( np . random . normal ( size = 100 ) ) s . hist ( bins = 20 , grid = False )
<matplotlib.axes._subplots.AxesSubplot at 0xcf9f5c0>
s . plot ( kind = 'kde' )
<matplotlib.axes._subplots.AxesSubplot at 0xd266710>
# X holds 0..9; Y is the straight line 2*X + 5 computed from it.
df3 = DataFrame(np.arange(10), columns=['X'])
df3['Y'] = 2 * df3['X'] + 5
df3
X
Y
0
0
5
1
1
7
2
2
9
3
3
11
4
4
13
5
5
15
6
6
17
7
7
19
8
8
21
9
9
23
df3 . plot ( kind = 'scatter' , x = 'X' , y = 'Y' )
<matplotlib.axes._subplots.AxesSubplot at 0xb1f98d0>
import numpy as np from pandas import Series , DataFrame import pandas as pd import seaborn as sns #导入seaborn库
tips = sns . load_dataset ( 'tips' ) tips . head ( )
total_bill
tip
sex
smoker
day
time
size
0
16.99
1.01
Female
No
Sun
Dinner
2
1
10.34
1.66
Male
No
Sun
Dinner
3
2
21.01
3.50
Male
No
Sun
Dinner
3
3
23.68
3.31
Male
No
Sun
Dinner
2
4
24.59
3.61
Female
No
Sun
Dinner
4
tips . shape
(244, 7)
tips . describe ( )
total_bill
tip
size
count
244.000000
244.000000
244.000000
mean
19.785943
2.998279
2.569672
std
8.902412
1.383638
0.951100
min
3.070000
1.000000
1.000000
25%
13.347500
2.000000
2.000000
50%
17.795000
2.900000
2.000000
75%
24.127500
3.562500
3.000000
max
50.810000
10.000000
6.000000
tips . info ( )
<class 'pandas.core.frame.DataFrame'> RangeIndex: 244 entries, 0 to 243 Data columns (total 7 columns): total_bill 244 non-null float64 tip 244 non-null float64 sex 244 non-null category smoker 244 non-null category day 244 non-null category time 244 non-null category size 244 non-null int64 dtypes: category(4), float64(2), int64(1) memory usage: 7.2 KB
tips . plot ( kind = 'scatter' , x = 'total_bill' , y = 'tip' )
<matplotlib.axes._subplots.AxesSubplot at 0xe034828>
male_tip = tips [ tips [ 'sex' ] == 'Male' ] [ 'tip' ] . mean ( ) male_tip
3.0896178343949052
female_tip = tips [ tips [ 'sex' ] == 'Female' ] [ 'tip' ] . mean ( ) female_tip
2.833448275862069
s = Series ( [ male_tip , female_tip ] , index = [ 'male' , 'female' ] ) s
male 3.089618 female 2.833448 dtype: float64
s . plot ( kind = 'bar' )
<matplotlib.axes._subplots.AxesSubplot at 0xddd27f0>
tips [ 'day' ] . unique ( )
[Sun, Sat, Thur, Fri] Categories (4, object): [Sun, Sat, Thur, Fri]
# Mean tip per day, computed with one boolean row mask per day instead of
# four copy-pasted statements.
day_order = ['Thur', 'Fri', 'Sat', 'Sun']
day_means = {
    day: tips.loc[tips['day'] == day, 'tip'].mean()
    for day in day_order
}
# Keep the individual names bound in case later cells refer to them.
thur_tip, fri_tip, sat_tip, sun_tip = (day_means[day] for day in day_order)
s = Series([thur_tip, fri_tip, sat_tip, sun_tip], index=day_order)
s
Thur 2.771452 Fri 2.734737 Sat 2.993103 Sun 3.255132 dtype: float64
s . plot ( kind = 'bar' )
<matplotlib.axes._subplots.AxesSubplot at 0xdefe5c0>
tips [ 'percent_tip' ] = tips [ 'tip' ] / ( tips [ 'total_bill' ] + tips [ 'tip' ] ) tips . head ( 10 )
total_bill
tip
sex
smoker
day
time
size
percent_tip
0
16.99
1.01
Female
No
Sun
Dinner
2
0.056111
1
10.34
1.66
Male
No
Sun
Dinner
3
0.138333
2
21.01
3.50
Male
No
Sun
Dinner
3
0.142799
3
23.68
3.31
Male
No
Sun
Dinner
2
0.122638
4
24.59
3.61
Female
No
Sun
Dinner
4
0.128014
5
25.29
4.71
Male
No
Sun
Dinner
4
0.157000
6
8.77
2.00
Male
No
Sun
Dinner
2
0.185701
7
26.88
3.12
Male
No
Sun
Dinner
4
0.104000
8
15.04
1.96
Male
No
Sun
Dinner
2
0.115294
9
14.78
3.23
Male
No
Sun
Dinner
2
0.179345
tips [ 'percent_tip' ] . hist ( bins = 50 )
<matplotlib.axes._subplots.AxesSubplot at 0xe264710>