# Build a DataFrame from a dict whose values mix scalars, a Series, a NumPy
# array and a Categorical. The scalar entries ('A', 'B', 'F') are repeated
# on every one of the four rows.
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)
print(df2)
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
# Each column of a DataFrame can have its own dtype:
print(df2.dtypes)
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
# Per-column summary statistics: count, mean, std, min, quartiles and max.
summary = df.describe()
print(summary)
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.831214 -0.222281 0.168970 0.341981
std 0.508670 0.998449 1.264150 1.008979
min 0.366885 -1.617911 -1.772022 -1.535794
25% 0.517139 -0.925609 -0.320225 0.287826
50% 0.666971 0.142311 0.079598 0.449776
75% 0.965798 0.257888 0.950632 0.991172
max 1.753858 0.984771 1.839167 1.282650
T表示矩陣的轉置:
# Transpose: the row labels become the columns and vice versa.
transposed = df.transpose()
print(transposed)
2019-12-14 2019-12-15 2019-12-16 2019-12-17 2019-12-18 2019-12-19
A 0.616569 0.483995 0.366885 0.717372 1.753858 1.048606
B 0.051401 -1.251278 -1.617911 0.233221 0.266110 0.984771
C 1.121409 -1.772022 0.438302 1.839167 -0.333931 -0.279106
D 0.307267 1.282650 1.124135 -1.535794 0.281345 0.592285
sort_index按照軸排序,axis=1表示第二個維度,即按照列索引排序:
# Sort along the column axis, in descending order of the column labels.
columns_descending = df.sort_index(axis='columns', ascending=False)
print(columns_descending)
D C B A
2019-12-14 0.307267 1.121409 0.051401 0.616569
2019-12-15 1.282650 -1.772022 -1.251278 0.483995
2019-12-16 1.124135 0.438302 -1.617911 0.366885
2019-12-17 -1.535794 1.839167 0.233221 0.717372
2019-12-18 0.281345 -0.333931 0.266110 1.753858
2019-12-19 0.592285 -0.279106 0.984771 1.048606
sort_values按照值排序,by='B'表示對第二列排序:
# Reorder the rows by the values in column 'B' (ascending).
rows_by_b = df.sort_values('B')
print(rows_by_b)
A B C D
2019-12-16 0.366885 -1.617911 0.438302 1.124135
2019-12-15 0.483995 -1.251278 -1.772022 1.282650
2019-12-14 0.616569 0.051401 1.121409 0.307267
2019-12-17 0.717372 0.233221 1.839167 -1.535794
2019-12-18 1.753858 0.266110 -0.333931 0.281345
2019-12-19 1.048606 0.984771 -0.279106 0.592285
3 選擇
3.1 獲取行、列
與numpy類似,DataFrame也可以用[]進行選擇:
# [] with a column label returns that column as a Series.
print(df['A'], '\n')
# [] with a slice selects rows: by position ...
print(df[0:3], '\n')
# ... or by index label; as the output shows, both endpoints are included.
print(df['20191216':'20191219'], '\n')
2019-12-14 0.616569
2019-12-15 0.483995
2019-12-16 0.366885
2019-12-17 0.717372
2019-12-18 1.753858
2019-12-19 1.048606
Freq: D, Name: A, dtype: float64 A B C D
2019-12-14 0.616569 0.051401 1.121409 0.307267
2019-12-15 0.483995 -1.251278 -1.772022 1.282650
2019-12-16 0.366885 -1.617911 0.438302 1.124135 A B C D
2019-12-16 0.366885 -1.617911 0.438302 1.124135
2019-12-17 0.717372 0.233221 1.839167 -1.535794
2019-12-18 1.753858 0.266110 -0.333931 0.281345
2019-12-19 1.048606 0.984771 -0.279106 0.592285
?
通過label進行選擇
# Label-based selection with .loc / .at.
print(df.loc[dates[0]], '\n')        # one row, selected by its index label
print(df.loc[:, ['A', 'B']], '\n')   # every row, two columns
# Label slices include both endpoints (see the output).
print(df.loc['20191216':'20191218', ['A', 'B']], '\n')
# .at is the faster accessor when selecting a single value.
print(df.at[dates[0], 'A'], '\n')
A 0.616569
B 0.051401
C 1.121409
D 0.307267
Name: 2019-12-14 00:00:00, dtype: float64 A B
2019-12-14 0.616569 0.051401
2019-12-15 0.483995 -1.251278
2019-12-16 0.366885 -1.617911
2019-12-17 0.717372 0.233221
2019-12-18 1.753858 0.266110
2019-12-19 1.048606 0.984771 A B
2019-12-16 0.366885 -1.617911
2019-12-17 0.717372 0.233221
2019-12-18 1.753858 0.266110 0.6165689271991402
?
通過整數下標進行選擇
# Position-based selection with .iloc / .iat (integer offsets).
print(df.iloc[3], '\n')                  # the row at position 3, as a Series
print(df.iloc[3:5, 0:2], '\n')           # row/column slices; the end offset is excluded
print(df.iloc[[1, 2, 4], [0, 2]], '\n')  # explicit lists of row and column positions
print(df.iloc[1:3, :], '\n')             # a row slice with every column
print(df.iat[1, 1], '\n')                # a single value by position
A 0.717372
B 0.233221
C 1.839167
D -1.535794
Name: 2019-12-17 00:00:00, dtype: float64 A B
2019-12-17 0.717372 0.233221
2019-12-18 1.753858 0.266110 A C
2019-12-15 0.483995 -1.772022
2019-12-16 0.366885 0.438302
2019-12-18 1.753858 -0.333931 A B C D
2019-12-15 0.483995 -1.251278 -1.772022 1.282650
2019-12-16 0.366885 -1.617911 0.438302 1.124135 -1.2512782747247186
?
通過布爾值下標進行選擇
# Boolean indexing. A column can also be referenced as an attribute (df.A).
print(df[df.A > 0], '\n')   # keep the rows where column A is positive
print(df[df > 0], '\n')     # element-wise mask: entries failing the test show as NaN

# isin() filters rows by membership in a list of values.
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
print(df2, '\n')
print(df2[df2['E'].isin(['two', 'four'])])
A B C D
2019-12-14 0.616569 0.051401 1.121409 0.307267
2019-12-15 0.483995 -1.251278 -1.772022 1.282650
2019-12-16 0.366885 -1.617911 0.438302 1.124135
2019-12-17 0.717372 0.233221 1.839167 -1.535794
2019-12-18 1.753858 0.266110 -0.333931 0.281345
2019-12-19 1.048606 0.984771 -0.279106 0.592285 A B C D
2019-12-14 0.616569 0.051401 1.121409 0.307267
2019-12-15 0.483995 NaN NaN 1.282650
2019-12-16 0.366885 NaN 0.438302 1.124135
2019-12-17 0.717372 0.233221 1.839167 NaN
2019-12-18 1.753858 0.266110 NaN 0.281345
2019-12-19 1.048606 0.984771 NaN 0.592285 A B C D E
2019-12-14 0.616569 0.051401 1.121409 0.307267 one
2019-12-15 0.483995 -1.251278 -1.772022 1.282650 one
2019-12-16 0.366885 -1.617911 0.438302 1.124135 two
2019-12-17 0.717372 0.233221 1.839167 -1.535794 three
2019-12-18 1.753858 0.266110 -0.333931 0.281345 four
2019-12-19 1.048606 0.984771 -0.279106 0.592285 three A B C D E
2019-12-16 0.366885 -1.617911 0.438302 1.124135 two
2019-12-18 1.753858 0.266110 -0.333931 0.281345 four
# reindex() selects the given rows and columns; column 'E' is new, so it is
# NaN wherever nothing has been assigned (see the output).
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
print(df1, '\n')
print(df1.dropna(how='any'), '\n')   # drop every row that contains a missing value
print(df1.fillna(value=5), '\n')     # replace every missing value with 5
print(pd.isna(df1))                  # boolean mask marking the missing values
A B C D F E
2019-12-14 0.000000 0.000000 1.121409 5 NaN 1.0
2019-12-15 0.483995 -1.251278 -1.772022 5 1.0 1.0
2019-12-16 0.366885 -1.617911 0.438302 5 2.0 NaN
2019-12-17 0.717372 0.233221 1.839167 5 3.0 NaN A B C D F E
2019-12-15 0.483995 -1.251278 -1.772022 5 1.0 1.0 A B C D F E
2019-12-14 0.000000 0.000000 1.121409 5 5.0 1.0
2019-12-15 0.483995 -1.251278 -1.772022 5 1.0 1.0
2019-12-16 0.366885 -1.617911 0.438302 5 2.0 5.0
2019-12-17 0.717372 0.233221 1.839167 5 3.0 5.0 A B C D F E
2019-12-14 False False False False True False
2019-12-15 False False False False False False
2019-12-16 False False False False False True
2019-12-17 False False False False False True
4 操作
4.1 統計
通常,操作都會把 NaN 排除在外
print(df.mean(), '\n')          # mean of each column
print(df.mean(axis=1), '\n')    # mean across the columns, i.e. one value per row
# shift(2) moves every value down two positions; the vacated slots become NaN.
print(pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2), '\n')
A 0.728453
B -0.230848
C 0.168970
D 5.000000
F 3.000000
dtype: float64 2019-12-14 1.530352
2019-12-15 0.692139
2019-12-16 1.237455
2019-12-17 2.157952
2019-12-18 2.137207
2019-12-19 2.350854
Freq: D, dtype: float64 2019-12-14 NaN
2019-12-15 NaN
2019-12-16 1.0
2019-12-17 3.0
2019-12-18 5.0
2019-12-19 NaN
Freq: D, dtype: float64
?
4.2 Apply
對數據apply一個函數
print(df, '\n')
# cumsum is applied to each column, giving a running total down the rows.
print(df.apply(np.cumsum), '\n')
# Range of each column: its maximum minus its minimum.
print(df.apply(lambda x: x.max() - x.min()))
A B C D F
2019-12-14 0.000000 0.000000 1.121409 5 NaN
2019-12-15 0.483995 -1.251278 -1.772022 5 1.0
2019-12-16 0.366885 -1.617911 0.438302 5 2.0
2019-12-17 0.717372 0.233221 1.839167 5 3.0
2019-12-18 1.753858 0.266110 -0.333931 5 4.0
2019-12-19 1.048606 0.984771 -0.279106 5 5.0 A B C D F
2019-12-14 0.000000 0.000000 1.121409 5 NaN
2019-12-15 0.483995 -1.251278 -0.650613 10 1.0
2019-12-16 0.850881 -2.869189 -0.212311 15 3.0
2019-12-17 1.568253 -2.635968 1.626856 20 6.0
2019-12-18 3.322111 -2.369858 1.292925 25 10.0
2019-12-19 4.370717 -1.385087 1.013819 30 15.0 A 1.753858
B 2.602682
C 3.611190
D 0.000000
F 4.000000
dtype: float64
# String methods are available on a Series through .str; the missing entry
# stays NaN in the result.
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
print(s.str.lower(), '\n')
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
# Grouping: split the rows by key, then aggregate each group.
df = pd.DataFrame(
    {
        'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
        'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
        'C': np.random.randn(8),
        'D': np.random.randn(8),
    }
)
print(df, '\n')

# Select the numeric columns explicitly so that the string column 'B' is not
# aggregated too; the result then has only the C and D columns.
sums_by_a = df.groupby('A')[['C', 'D']].sum()
print(sums_by_a, '\n')

# Grouping by several keys yields a hierarchical (A, B) index.
sums_by_ab = df.groupby(['A', 'B'])[['C', 'D']].sum()
print(sums_by_ab)
A B C D
0 foo one 0.182951 -0.346016
1 bar one 1.906573 -0.386902
2 foo two 0.102443 -1.087078
3 bar three 1.023196 0.467237
4 foo two -0.940578 -0.414947
5 bar two -1.075379 -0.844441
6 foo one 0.514689 0.038393
7 foo three -2.136912 -1.001738 C D
A
bar 1.854390 -0.764105
foo -2.277406 -2.811385 C D
A B
bar one 1.906573 -0.386902three 1.023196 0.467237two -1.075379 -0.844441
foo one 0.697640 -0.307622three -2.136912 -1.001738two -0.838134 -1.502025
7 Reshaping
7.1 Stack層疊
# Pair the two label lists element-wise into (first, second) index tuples.
tuples = list(
    zip(
        ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
        ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
    )
)
print(tuples)

# An 8x2 frame of random values on a two-level (first, second) row index.
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])

# Keep only the first four rows for the stack/unstack examples.
df2 = df[:4]
print(df2, '\n')
[('bar', 'one'), ('bar', 'two'), ('baz', 'one'), ('baz', 'two'), ('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')]A B
first second
bar one 0.220077 1.659987two -1.504953 0.350697
baz one 1.631244 0.637820two -0.083256 0.370784
?
# stack() moves the column labels into a new innermost index level.
stacked = df2.stack()
print(stacked, '\n')
# unstack() reverses this. With no argument it unstacks the last level;
# a level number picks another one (0 = 'first', 1 = 'second').
print(stacked.unstack(), '\n')
print(stacked.unstack(0), '\n')
print(stacked.unstack(1), '\n')
first second
bar one A 0.220077B 1.659987two A -1.504953B 0.350697
baz one A 1.631244B 0.637820two A -0.083256B 0.370784
dtype: float64 A B
first second
bar one 0.220077 1.659987two -1.504953 0.350697
baz one 1.631244 0.637820two -0.083256 0.370784 first bar baz
second
one A 0.220077 1.631244B 1.659987 0.637820
two A -1.504953 -0.083256B 0.350697 0.370784 second one two
first
bar A 0.220077 -1.504953B 1.659987 0.350697
baz A 1.631244 -0.083256B 0.637820 0.370784
df = pd.DataFrame(
    {
        'A': ['one', 'one', 'two', 'three'] * 3,
        'B': ['A', 'B', 'C'] * 4,
        'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
        'D': np.random.randn(12),
        'E': np.random.randn(12),
    }
)
print(df, '\n')

# Spread the values of 'D' into one column per distinct 'C', indexed by
# (A, B). Combinations that do not occur in the data show up as NaN.
pivoted = pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
print(pivoted, '\n')
A B C D E
0 one A foo -0.647116 -0.383797
1 one B foo -1.009740 -0.149929
2 two C foo -0.000333 0.233550
3 three A bar 0.308752 -0.204858
4 one B bar 0.157966 -0.066707
5 one C bar 1.028976 -1.697499
6 two A foo -0.170791 -0.161074
7 three B foo 1.222346 -0.465335
8 one C foo -0.024742 -1.984697
9 one A bar -1.253050 0.675345
10 two B bar -0.124737 -0.169243
11 three C bar 0.501580 1.049995 C bar foo
A B
one A -1.253050 -0.647116B 0.157966 -1.009740C 1.028976 -0.024742
three A 0.308752 NaNB NaN 1.222346C 0.501580 NaN
two A NaN -0.170791B -0.124737 NaNC NaN -0.000333
df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4, 5, 6],
        "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e'],
    }
)
# Convert raw_grade to the categorical dtype.
df["grade"] = df["raw_grade"].astype("category")
print(df["grade"], '\n')
0 a
1 b
2 b
3 a
4 a
5 e
Name: grade, dtype: category
Categories (3, object): [a, b, e]
?
# Rename the categories; the new names are matched to the existing categories
# by position (a -> very good, b -> good, e -> very bad). rename_categories()
# is used because assigning to .cat.categories is no longer supported by
# current pandas releases.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
print(df["grade"], '\n')
# Reorder the categories and, at the same time, add the ones that are missing.
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
print(df["grade"], '\n')
0 very good
1 good
2 good
3 very good
4 very good
5 very bad
Name: grade, dtype: category
Categories (3, object): [very good, good, very bad] 0 very good
1 good
2 good
3 very good
4 very good
5 very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]
?
# Sorting follows the order of the categories, not lexical order.
print(df.sort_values(by="grade"), '\n')
# observed=False keeps the categories that have no rows, so they are reported
# with a size of 0.
print(df.groupby("grade", observed=False).size())
id raw_grade grade
5 6 e very bad
1 2 b good
2 3 b good
0 1 a very good
3 4 a very good
4 5 a very good grade
very bad 1
bad 0
medium 0
good 2
very good 3
dtype: int64
# Gotcha: a Series has no single truth value, so using one as an `if`
# condition raises ValueError (see the traceback that follows). As the error
# message says, state the intended test explicitly with .empty, .any(),
# .all() or .item().
if pd.Series([False,True,False]):print("I was true")
---------------------------------------------------------------------------ValueError Traceback (most recent call last)<ipython-input-210-5c782b38cd2f> in <module>
----> 1 if pd.Series([False, True, False]):2 print("I was true")D:\Applications\Anaconda3\lib\site-packages\pandas\core\generic.py in __nonzero__(self)1553 "The truth value of a {0} is ambiguous. "1554 "Use a.empty, a.bool(), a.item(), a.any() or a.all().".format(
-> 1555 self.__class__.__name__1556 )1557 )ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().