一、表格创建的几种方法
import pandas as pd
import numpy as np
#1.pd.read_csv方法
data=pd.read_csv(r"G:\大数据实验数据库\3.大数据实验数据\meal_order_info.csv",encoding="GBK")
#2.pd.read_sql_table方法
import sqlalchemy
conn=sqlalchemy.create_engine(r"sqlite:///G:\大数据实验数据库\0.engine数据库\tesss.db")
pd.read_sql_table("meal_order_detail1",conn)
#3.pd.DataFrame方法
t3=pd.DataFrame([],columns=["a","b","c"],index=[1,2,3])
for i in t3.columns:
for j in t3.index:
t3.loc[j,i]=0
print(t3)
二、表格合并的几种方法
1.concat方法
import pandas as pd
import numpy as np
tx=pd.DataFrame([],columns=list("ABCD"),index=range(1,5))
for i in tx.columns:
for j in tx.index:
tx.loc[j,i]=i+str(j)
ty=pd.DataFrame([],columns=list("BDF"),index=[2,4,6,8])
for i in ty.columns:
for j in ty.index:
ty.loc[j,i]=i+str(j)
#1.axis=0 表格上下连接 inner交集 outer并集
t1=pd.concat([tx,ty],axis=0,join="inner",sort=True)
t2=pd.concat([tx,ty],axis=0,join="outer",sort=True)
#2.axis=1 表格左右连接
t3=pd.concat([tx,ty],axis=1,join="inner",sort=True)
t4=pd.concat([tx,ty],axis=1,join="outer",sort=True)
2.append方法(上下连接)
import pandas as pd
import numpy as np
px1=tx.iloc[:,2:]
px2=pd.DataFrame([],columns=["A","B"],index=[5])
px1.append(px2)
3.merge方法(相同关键字合并)
import pandas as pd
import numpy as np
#1.关键字不在同一行合并
tx=pd.DataFrame([],columns=["A","B","Key"],index=range(1,5))
for i in tx.columns:
for j in tx.index:
tx.loc[j,i]=i+str(j)
tx2=pd.DataFrame([],columns=["C","D","Key"],index=range(1,5))
for i in tx2.columns:
for j in tx2.index:
tx2.loc[j,i]=i+str(j)
pd.merge(tx,tx2,left_on="Key",right_on="Key")
#pd.merge(tx2,tx,left_on="Key",right_on="Key")
4.combine方法(数据补充)
t8=pd.DataFrame([[3,5],[4.6,np.nan]],columns=[0,1],index=[0,1])
t9=pd.DataFrame([[np.nan,np.nan],[3,4.0]],columns=[0,1],index=[0,1])
t10=t8.combine_first(t9)
print(t8)
print("**************")
print(t9)
print("**************")
print(t10)
三、loc和iloc的区别
import pandas as pd
import numpy as np
data=pd.read_csv(r"G:\大数据实验数据库\3.大数据实验数据\meal_order_info.csv",encoding="GBK")
#1.loc用行列的名称表示表格数据
data.loc[3,"info_id"]
data.loc[3:5,["info_id","emp_id"]]
#2.用(x,y)纯数字表示表格数据
data.iloc[1,1]
data.iloc[:,:2]
四、去重
import pandas as pd
import numpy as np
data=pd.read_csv(r"G:\大数据实验数据库\3.大数据实验数据\meal_order_info.csv",encoding="GBK")
print(len(data["info_id"]))
#1.drop_duplicates去重
print(len(data["info_id"].drop_duplicates()))
#2.set去重(数据的顺序会改变)
print(len(set(data["info_id"])))
#3.多列去重
print(len(data.drop_duplicates(subset=["info_id","emp_id"])))
补充:fillna方法(当里面没有数,默认填写)
tx=pd.DataFrame([],columns=list("ABCD"),index=range(1,5))
tx.fillna(99)
|