Pandas的基本使用
點擊標題即可獲取文章源代碼和筆記
4.1.0 概要
Pandas基礎處理:Pandas是什么?為什么用?、核心數據結構(DataFrame、Panel、Series)、基本操作、運算、畫圖、文件的讀取與存儲;高級處理
4.1 Pandas介紹
4.1.1 Pandas介紹
- 數據處理工具,名稱來自 panel + data + analysis
- panel:面板數據,即計量經濟學中的三維數據
4.1.2 為什么使用Pandas:便捷的數據處理能力;讀取文件方便;封裝了Matplotlib、Numpy的畫圖和計算
4.1.3 DataFrame
結構:既有行索引,又有列索引的二維數組
屬性:shape、index、columns、values、T
方法:head()、tail()
3 DataFrame索引的設置
1 )修改行列索引值
2 )重設索引
3 )設置新索引
2 Panel:DataFrame的容器
3 Series:帶索引的一維數組;屬性:index、values
總結:DataFrame是Series的容器,Panel是DataFrame的容器
4.2 基本數據操作
4.2.1 索引操作
1)直接索引:先列后行
2)按名字索引:loc
3)按數字索引:iloc
4)組合索引:數字、名字
4.2.3 排序:對內容排序(DataFrame、Series);對索引排序(DataFrame、Series)
4.3 DataFrame運算
算術運算
邏輯運算:邏輯運算符、布爾索引;邏輯運算函數 query()、isin()
統計運算:min、max、mean、median、var、std、np.argmax()、np.argmin()
自定義運算
apply ( func
, axis
= 0 ) True func
: 自定義函數
4.4 Pandas畫圖:sr.plot()
4.5 文件讀取與存儲
4.5.1 CSV
pd.read_csv(path),常用參數:usecols=、names=
dataframe.to_csv(path),常用參數:columns=[]、index=False、header=False
4.5.2 HDF5
hdf5:存儲3維數據的文件,key1 對應 dataframe1(二維數據),key2 對應 dataframe2(二維數據)
pd.read_hdf(path, key=)
df.to_hdf(path, key=)
4.5.3 JSON
pd.read_json(path),常用參數:orient="records"、lines=True
df.to_json(path),常用參數:orient="records"、lines=True
4.1.3 DataFrame
import numpy
as np
stock_change
= np
. random
. normal
( 0 , 1 , ( 10 , 5 ) )
stock_change
array([[ 0.77072465, 1.30408183, -0.44043464, 0.8900768 , -0.80947118],[ 0.92407994, 0.01646795, -1.26614793, 1.52393669, -0.85373051],[-1.68378051, 0.4302981 , 0.8069393 , 0.60557427, -0.03960376],[ 0.75708007, -0.39899325, 0.23027082, -0.89585658, -1.86590247],[-0.41516245, -1.31841546, 0.16256478, -0.67449097, -1.26234013],[-0.27687242, -0.74154521, -0.03755446, 1.24182603, -0.79444361],[-0.2549323 , -0.41034663, -1.85076521, -1.28663451, -0.28566877],[ 1.22453612, -1.60200055, -1.83171522, -0.85322799, -1.70950421],[ 2.00461483, 1.49338564, 0.33928513, -0.1776084 , -0.39698965],[ 0.2184662 , -0.03868143, -0.21432675, 0.00604093, 1.35011139]])
import pandas
as pd
pd
. DataFrame
( stock_change
)
01234 00.770725 1.304082 -0.440435 0.890077 -0.809471 10.924080 0.016468 -1.266148 1.523937 -0.853731 2-1.683781 0.430298 0.806939 0.605574 -0.039604 30.757080 -0.398993 0.230271 -0.895857 -1.865902 4-0.415162 -1.318415 0.162565 -0.674491 -1.262340 5-0.276872 -0.741545 -0.037554 1.241826 -0.794444 6-0.254932 -0.410347 -1.850765 -1.286635 -0.285669 71.224536 -1.602001 -1.831715 -0.853228 -1.709504 82.004615 1.493386 0.339285 -0.177608 -0.396990 90.218466 -0.038681 -0.214327 0.006041 1.350111
stock_code
= [ '股票' + str ( i
) for i
in range ( stock_change
. shape
[ 0 ] ) ]
stock_code
['股票0', '股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9']
data
= pd
. DataFrame
( stock_change
, index
= stock_code
)
data
01234 股票00.770725 1.304082 -0.440435 0.890077 -0.809471 股票10.924080 0.016468 -1.266148 1.523937 -0.853731 股票2-1.683781 0.430298 0.806939 0.605574 -0.039604 股票30.757080 -0.398993 0.230271 -0.895857 -1.865902 股票4-0.415162 -1.318415 0.162565 -0.674491 -1.262340 股票5-0.276872 -0.741545 -0.037554 1.241826 -0.794444 股票6-0.254932 -0.410347 -1.850765 -1.286635 -0.285669 股票71.224536 -1.602001 -1.831715 -0.853228 -1.709504 股票82.004615 1.493386 0.339285 -0.177608 -0.396990 股票90.218466 -0.038681 -0.214327 0.006041 1.350111
date
= pd
. date_range
( start
= "20200618" , periods
= 5 , freq
= "B" )
date
DatetimeIndex(['2020-06-18', '2020-06-19', '2020-06-22', '2020-06-23','2020-06-24'],dtype='datetime64[ns]', freq='B')
data
= pd
. DataFrame
( stock_change
, index
= stock_code
, columns
= date
)
data
2020-06-182020-06-192020-06-222020-06-232020-06-24 股票00.770725 1.304082 -0.440435 0.890077 -0.809471 股票10.924080 0.016468 -1.266148 1.523937 -0.853731 股票2-1.683781 0.430298 0.806939 0.605574 -0.039604 股票30.757080 -0.398993 0.230271 -0.895857 -1.865902 股票4-0.415162 -1.318415 0.162565 -0.674491 -1.262340 股票5-0.276872 -0.741545 -0.037554 1.241826 -0.794444 股票6-0.254932 -0.410347 -1.850765 -1.286635 -0.285669 股票71.224536 -1.602001 -1.831715 -0.853228 -1.709504 股票82.004615 1.493386 0.339285 -0.177608 -0.396990 股票90.218466 -0.038681 -0.214327 0.006041 1.350111
DataFrame屬性
data
. shape
(10, 5)
data
. index
Index(['股票0', '股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9'], dtype='object')
data
. columns
DatetimeIndex(['2020-06-18', '2020-06-19', '2020-06-22', '2020-06-23','2020-06-24'],dtype='datetime64[ns]', freq='B')
data
. values
array([[ 0.77072465, 1.30408183, -0.44043464, 0.8900768 , -0.80947118],[ 0.92407994, 0.01646795, -1.26614793, 1.52393669, -0.85373051],[-1.68378051, 0.4302981 , 0.8069393 , 0.60557427, -0.03960376],[ 0.75708007, -0.39899325, 0.23027082, -0.89585658, -1.86590247],[-0.41516245, -1.31841546, 0.16256478, -0.67449097, -1.26234013],[-0.27687242, -0.74154521, -0.03755446, 1.24182603, -0.79444361],[-0.2549323 , -0.41034663, -1.85076521, -1.28663451, -0.28566877],[ 1.22453612, -1.60200055, -1.83171522, -0.85322799, -1.70950421],[ 2.00461483, 1.49338564, 0.33928513, -0.1776084 , -0.39698965],[ 0.2184662 , -0.03868143, -0.21432675, 0.00604093, 1.35011139]])
data
. T
股票0股票1股票2股票3股票4股票5股票6股票7股票8股票9 2020-06-180.770725 0.924080 -1.683781 0.757080 -0.415162 -0.276872 -0.254932 1.224536 2.004615 0.218466 2020-06-191.304082 0.016468 0.430298 -0.398993 -1.318415 -0.741545 -0.410347 -1.602001 1.493386 -0.038681 2020-06-22-0.440435 -1.266148 0.806939 0.230271 0.162565 -0.037554 -1.850765 -1.831715 0.339285 -0.214327 2020-06-230.890077 1.523937 0.605574 -0.895857 -0.674491 1.241826 -1.286635 -0.853228 -0.177608 0.006041 2020-06-24-0.809471 -0.853731 -0.039604 -1.865902 -1.262340 -0.794444 -0.285669 -1.709504 -0.396990 1.350111
DataFrame方法
data
. head
( )
2020-06-182020-06-192020-06-222020-06-232020-06-24 股票00.770725 1.304082 -0.440435 0.890077 -0.809471 股票10.924080 0.016468 -1.266148 1.523937 -0.853731 股票2-1.683781 0.430298 0.806939 0.605574 -0.039604 股票30.757080 -0.398993 0.230271 -0.895857 -1.865902 股票4-0.415162 -1.318415 0.162565 -0.674491 -1.262340
data
. tail
( )
2020-06-182020-06-192020-06-222020-06-232020-06-24 股票5-0.276872 -0.741545 -0.037554 1.241826 -0.794444 股票6-0.254932 -0.410347 -1.850765 -1.286635 -0.285669 股票71.224536 -1.602001 -1.831715 -0.853228 -1.709504 股票82.004615 1.493386 0.339285 -0.177608 -0.396990 股票90.218466 -0.038681 -0.214327 0.006041 1.350111
3 DataFrame索引的設置
data
. index
[ 2 ]
'股票2'
data
. index
[ 2 ] = "股票88"
---------------------------------------------------------------------------TypeError Traceback (most recent call last)<ipython-input-19-9e95917cc4d9> in <module>
----> 1 data.index[2] = "股票88"D:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value)3908 3909 def __setitem__(self, key, value):
-> 3910 raise TypeError("Index does not support mutable operations")3911 3912 def __getitem__(self, key):TypeError: Index does not support mutable operations
stock_
= [ "股票_{}" . format ( i
) for i
in range ( 10 ) ]
data
. index
= stock_
data
. index
Index(['股票_0', '股票_1', '股票_2', '股票_3', '股票_4', '股票_5', '股票_6', '股票_7', '股票_8','股票_9'],dtype='object')
重設索引
reset_index(drop=False):設置新的下標索引。drop:默認為False,不刪除原來索引;如果為True,刪除原來的索引值
data
. reset_index
( )
index2020-06-18 00:00:002020-06-19 00:00:002020-06-22 00:00:002020-06-23 00:00:002020-06-24 00:00:00 0股票_0 0.770725 1.304082 -0.440435 0.890077 -0.809471 1股票_1 0.924080 0.016468 -1.266148 1.523937 -0.853731 2股票_2 -1.683781 0.430298 0.806939 0.605574 -0.039604 3股票_3 0.757080 -0.398993 0.230271 -0.895857 -1.865902 4股票_4 -0.415162 -1.318415 0.162565 -0.674491 -1.262340 5股票_5 -0.276872 -0.741545 -0.037554 1.241826 -0.794444 6股票_6 -0.254932 -0.410347 -1.850765 -1.286635 -0.285669 7股票_7 1.224536 -1.602001 -1.831715 -0.853228 -1.709504 8股票_8 2.004615 1.493386 0.339285 -0.177608 -0.396990 9股票_9 0.218466 -0.038681 -0.214327 0.006041 1.350111
data
. reset_index
( drop
= True )
2020-06-182020-06-192020-06-222020-06-232020-06-24 00.770725 1.304082 -0.440435 0.890077 -0.809471 10.924080 0.016468 -1.266148 1.523937 -0.853731 2-1.683781 0.430298 0.806939 0.605574 -0.039604 30.757080 -0.398993 0.230271 -0.895857 -1.865902 4-0.415162 -1.318415 0.162565 -0.674491 -1.262340 5-0.276872 -0.741545 -0.037554 1.241826 -0.794444 6-0.254932 -0.410347 -1.850765 -1.286635 -0.285669 71.224536 -1.602001 -1.831715 -0.853228 -1.709504 82.004615 1.493386 0.339285 -0.177608 -0.396990 90.218466 -0.038681 -0.214327 0.006041 1.350111
以某列值設置為新的索引
set_index(keys, drop=True):keys:列索引名或者列索引名稱的列表;drop:boolean,default True,當作新的索引后刪除原來的索引列
設置新索引案例
df
= pd
. DataFrame
( { 'month' : [ 1 , 4 , 7 , 10 ] , 'year' : [ 2012 , 2014 , 2013 , 2014 ] , 'sale' : [ 55 , 40 , 84 , 31 ]
} )
df
monthyearsale 01 2012 55 14 2014 40 27 2013 84 310 2014 31
df
. set_index
( 'month' )
yearsale month 12012 55 42014 40 72013 84 102014 31
new_df
= df
. set_index
( [ 'year' , 'month' ] )
new_df
sale yearmonth 2012155 2014440 2013784 20141031
new_df
. index
MultiIndex([(2012, 1),(2014, 4),(2013, 7),(2014, 10)],names=['year', 'month'])
4.1.4 MultiIndex 與 Panel的關系
1 MultiIndex:多級或分層索引對象。
names: levels的名稱
levels:每個level的元組值
new_df
. index
. names
FrozenList(['year', 'month'])
new_df
. index
. levels
FrozenList([[2012, 2013, 2014], [1, 4, 7, 10]])
2 Panel
p
= pd
. Panel
( )
p
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version"""Entry point for launching an IPython kernel.<pandas.__getattr__.<locals>.Panel at 0x203fd31ea08>
data
2020-06-182020-06-192020-06-222020-06-232020-06-24 股票_00.770725 1.304082 -0.440435 0.890077 -0.809471 股票_10.924080 0.016468 -1.266148 1.523937 -0.853731 股票_2-1.683781 0.430298 0.806939 0.605574 -0.039604 股票_30.757080 -0.398993 0.230271 -0.895857 -1.865902 股票_4-0.415162 -1.318415 0.162565 -0.674491 -1.262340 股票_5-0.276872 -0.741545 -0.037554 1.241826 -0.794444 股票_6-0.254932 -0.410347 -1.850765 -1.286635 -0.285669 股票_71.224536 -1.602001 -1.831715 -0.853228 -1.709504 股票_82.004615 1.493386 0.339285 -0.177608 -0.396990 股票_90.218466 -0.038681 -0.214327 0.006041 1.350111
Series
data
. iloc
[ 1 , : ]
2020-06-18 0.924080
2020-06-19 0.016468
2020-06-22 -1.266148
2020-06-23 1.523937
2020-06-24 -0.853731
Freq: B, Name: 股票_1, dtype: float64
type ( data
. iloc
[ 1 , : ] )
pandas.core.series.Series
屬性
data
. iloc
[ 1 , : ] . index
DatetimeIndex(['2020-06-18', '2020-06-19', '2020-06-22', '2020-06-23','2020-06-24'],dtype='datetime64[ns]', freq='B')
data
. iloc
[ 1 , : ] . values
array([ 0.92407994, 0.01646795, -1.26614793, 1.52393669, -0.85373051])
1. 創建Series
通過已有數據創建
pd
. Series
( np
. arange
( 10 ) )
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int32
pd
. Series
( [ 6.7 , 5.6 , 3 , 10 , 2 ] , index
= [ 1 , 2 , 3 , 4 , 5 ] )
1 6.7
2 5.6
3 3.0
4 10.0
5 2.0
dtype: float64
pd
. Series
( { 'red' : 100 , 'blue' : 200 , 'green' : 500 , 'yellow' : 1000
} )
red 100
blue 200
green 500
yellow 1000
dtype: int64
總結
DataFrame 是 Series的容器,Panel 是 DataFrame的容器
4.2 基本數據操作
datas
= pd
. read_excel
( "./datas/szfj_baoan.xls" )
datas
districtroomnumhallAREAC_floorfloor_numschoolsubwayper_price 0baoan 3 2 89.3 middle 31 0 0 7.0773 1baoan 4 2 127.0 high 31 0 0 6.9291 2baoan 1 1 28.0 low 39 0 0 3.9286 3baoan 1 1 28.0 middle 30 0 0 3.3568 4baoan 2 2 78.0 middle 8 1 1 5.0769 ...... ... ... ... ... ... ... ... ... 1246baoan 4 2 89.3 low 8 0 0 4.2553 1247baoan 2 1 67.0 middle 30 0 0 3.8060 1248baoan 2 2 67.4 middle 29 1 0 5.3412 1249baoan 2 2 73.1 low 15 1 0 5.9508 1250baoan 3 2 86.2 middle 32 0 1 4.5244
1251 rows × 9 columns
datas
. columns
Index(['district', 'roomnum', 'hall', 'AREA', 'C_floor', 'floor_num', 'school','subway', 'per_price'],dtype='object')
datas
= datas
. drop
( columns
= [ 'school' , 'subway' , ] , axis
= 0 )
datas
districtroomnumhallAREAC_floorfloor_numper_price 0baoan 3 2 89.3 middle 31 7.0773 1baoan 4 2 127.0 high 31 6.9291 2baoan 1 1 28.0 low 39 3.9286 3baoan 1 1 28.0 middle 30 3.3568 4baoan 2 2 78.0 middle 8 5.0769 ...... ... ... ... ... ... ... 1246baoan 4 2 89.3 low 8 4.2553 1247baoan 2 1 67.0 middle 30 3.8060 1248baoan 2 2 67.4 middle 29 5.3412 1249baoan 2 2 73.1 low 15 5.9508 1250baoan 3 2 86.2 middle 32 4.5244
1251 rows × 7 columns
4.2.1 索引操作
1.直接使用行列索引(先列后行)
datas
[ "per_price" ] [ 0 ]
7.0773
2. 按名字索引(先行后列)
datas
. loc
[ 0 ] [ "per_price" ]
7.0773
datas
. loc
[ 0 , "per_price" ]
7.0773
3.按數字索引
datas
. iloc
[ 0 , 6 ]
7.0773
datas
. index
[ 0 : 4 ]
RangeIndex(start=0, stop=4, step=1)
datas
. loc
[ datas
. index
[ 0 : 4 ] , [ "district" , "roomnum" ] ]
districtroomnum 0baoan 3 1baoan 4 2baoan 1 3baoan 1
datas
. columns
. get_indexer
( [ "district" , "roomnum" ] )
array([0, 1], dtype=int64)
datas
. iloc
[ 0 : 4 , datas
. columns
. get_indexer
( [ "district" , "roomnum" ] ) ]
districtroomnum 0baoan 3 1baoan 4 2baoan 1 3baoan 1
4.2.2 賦值操作
datas
[ "hall" ] = 5
datas
. head
( )
districtroomnumhallAREAC_floorfloor_numper_price 0baoan 3 5 89.3 middle 31 7.0773 1baoan 4 5 127.0 high 31 6.9291 2baoan 1 5 28.0 low 39 3.9286 3baoan 1 5 28.0 middle 30 3.3568 4baoan 2 5 78.0 middle 8 5.0769
datas
. hall
= 1
datas
. head
( )
districtroomnumhallAREAC_floorfloor_numper_price 0baoan 3 1 89.3 middle 31 7.0773 1baoan 4 1 127.0 high 31 6.9291 2baoan 1 1 28.0 low 39 3.9286 3baoan 1 1 28.0 middle 30 3.3568 4baoan 2 1 78.0 middle 8 5.0769
datas
. iloc
[ 0 , 0 ] = "zzzz"
datas
. head
( )
districtroomnumhallAREAC_floorfloor_numper_price 0zzzz 3 1 89.3 middle 31 7.0773 1baoan 4 1 127.0 high 31 6.9291 2baoan 1 1 28.0 low 39 3.9286 3baoan 1 1 28.0 middle 30 3.3568 4baoan 2 1 78.0 middle 8 5.0769
4.2.3 排序
datas
. sort_values
( by
= "per_price" , ascending
= False )
districtroomnumhallAREAC_floorfloor_numper_price 917baoan 4 1 93.59 high 28 21.9040 356baoan 8 1 248.99 low 7 21.2860 576baoan 1 1 21.95 middle 22 19.3622 296baoan 4 1 93.59 high 28 19.2328 186baoan 3 1 113.60 middle 31 16.5493 ...... ... ... ... ... ... ... 911baoan 2 1 89.00 middle 16 1.6854 841baoan 2 1 75.00 high 7 1.6667 1188baoan 3 1 110.00 middle 33 1.5909 684baoan 3 1 89.00 middle 26 1.2247 1047baoan 3 1 98.90 middle 26 1.1931
1251 rows × 7 columns
datas
. sort_values
( by
= "per_price" )
districtroomnumhallAREAC_floorfloor_numper_price 1047baoan 3 1 98.90 middle 26 1.1931 684baoan 3 1 89.00 middle 26 1.2247 1188baoan 3 1 110.00 middle 33 1.5909 841baoan 2 1 75.00 high 7 1.6667 911baoan 2 1 89.00 middle 16 1.6854 ...... ... ... ... ... ... ... 186baoan 3 1 113.60 middle 31 16.5493 296baoan 4 1 93.59 high 28 19.2328 576baoan 1 1 21.95 middle 22 19.3622 356baoan 8 1 248.99 low 7 21.2860 917baoan 4 1 93.59 high 28 21.9040
1251 rows × 7 columns
datas
. sort_values
( by
= [ "district" , "per_price" ] )
districtroomnumhallAREAC_floorfloor_numper_price 1047baoan 3 1 98.90 middle 26 1.1931 684baoan 3 1 89.00 middle 26 1.2247 1188baoan 3 1 110.00 middle 33 1.5909 841baoan 2 1 75.00 high 7 1.6667 911baoan 2 1 89.00 middle 16 1.6854 ...... ... ... ... ... ... ... 296baoan 4 1 93.59 high 28 19.2328 576baoan 1 1 21.95 middle 22 19.3622 356baoan 8 1 248.99 low 7 21.2860 917baoan 4 1 93.59 high 28 21.9040 0zzzz 3 1 89.30 middle 31 7.0773
1251 rows × 7 columns
datas
. sort_index
( )
districtroomnumhallAREAC_floorfloor_numper_price 0zzzz 3 1 89.3 middle 31 7.0773 1baoan 4 1 127.0 high 31 6.9291 2baoan 1 1 28.0 low 39 3.9286 3baoan 1 1 28.0 middle 30 3.3568 4baoan 2 1 78.0 middle 8 5.0769 ...... ... ... ... ... ... ... 1246baoan 4 1 89.3 low 8 4.2553 1247baoan 2 1 67.0 middle 30 3.8060 1248baoan 2 1 67.4 middle 29 5.3412 1249baoan 2 1 73.1 low 15 5.9508 1250baoan 3 1 86.2 middle 32 4.5244
1251 rows × 7 columns
sr
= datas
[ "per_price" ]
sr
0 7.0773
1 6.9291
2 3.9286
3 3.3568
4 5.0769...
1246 4.2553
1247 3.8060
1248 5.3412
1249 5.9508
1250 4.5244
Name: per_price, Length: 1251, dtype: float64
sr
. sort_values
( )
1047 1.1931
684 1.2247
1188 1.5909
841 1.6667
911 1.6854...
186 16.5493
296 19.2328
576 19.3622
356 21.2860
917 21.9040
Name: per_price, Length: 1251, dtype: float64
sr
. sort_index
( )
0 7.0773
1 6.9291
2 3.9286
3 3.3568
4 5.0769...
1246 4.2553
1247 3.8060
1248 5.3412
1249 5.9508
1250 4.5244
Name: per_price, Length: 1251, dtype: float64
4.3 DataFrame運算
算術運算
datas
[ "roomnum" ] + 3
0 6
1 7
2 4
3 4
4 5..
1246 7
1247 5
1248 5
1249 5
1250 6
Name: roomnum, Length: 1251, dtype: int64
datas
[ "roomnum" ] . add
( 3 ) . head
( )
0 6
1 7
2 4
3 4
4 5
Name: roomnum, dtype: int64
datas
. iloc
[ : , 1 : 4 ]
roomnumhallAREA 03 1 89.3 14 1 127.0 21 1 28.0 31 1 28.0 42 1 78.0 ...... ... ... 12464 1 89.3 12472 1 67.0 12482 1 67.4 12492 1 73.1 12503 1 86.2
1251 rows × 3 columns
datas
. iloc
[ : , 1 : 4 ] + 10
roomnumhallAREA 013 11 99.3 114 11 137.0 211 11 38.0 311 11 38.0 412 11 88.0 ...... ... ... 124614 11 99.3 124712 11 77.0 124812 11 77.4 124912 11 83.1 125013 11 96.2
1251 rows × 3 columns
邏輯運算
datas
[ 'AREA' ] > 100
0 False
1 True
2 False
3 False
4 False...
1246 False
1247 False
1248 False
1249 False
1250 False
Name: AREA, Length: 1251, dtype: bool
datas
[ datas
[ 'AREA' ] > 100 ]
districtroomnumhallAREAC_floorfloor_numper_price 1baoan 4 1 127.00 high 31 6.9291 5baoan 4 1 125.17 middle 15 5.8161 16baoan 3 1 151.00 high 20 4.9669 25baoan 3 1 116.00 high 18 5.0000 26baoan 5 1 151.25 high 30 7.6033 ...... ... ... ... ... ... ... 1232baoan 5 1 127.17 low 24 5.1113 1238baoan 4 1 130.74 low 30 13.0029 1239baoan 3 1 102.10 middle 28 10.8717 1241baoan 5 1 151.30 high 29 7.2703 1243baoan 4 1 142.25 high 32 6.3269
322 rows × 7 columns
( datas
[ "AREA" ] > 100 ) & ( datas
[ "per_price" ] < 40000 )
0 False
1 True
2 False
3 False
4 False...
1246 False
1247 False
1248 False
1249 False
1250 False
Length: 1251, dtype: bool
datas
[ ( datas
[ "AREA" ] > 100 ) & ( datas
[ "per_price" ] < 40000 ) ]
districtroomnumhallAREAC_floorfloor_numper_price 1baoan 4 1 127.00 high 31 6.9291 5baoan 4 1 125.17 middle 15 5.8161 16baoan 3 1 151.00 high 20 4.9669 25baoan 3 1 116.00 high 18 5.0000 26baoan 5 1 151.25 high 30 7.6033 ...... ... ... ... ... ... ... 1232baoan 5 1 127.17 low 24 5.1113 1238baoan 4 1 130.74 low 30 13.0029 1239baoan 3 1 102.10 middle 28 10.8717 1241baoan 5 1 151.30 high 29 7.2703 1243baoan 4 1 142.25 high 32 6.3269
322 rows × 7 columns
邏輯運算函數
datas
. query
( "AREA>100 & per_price<40000" )
districtroomnumhallAREAC_floorfloor_numper_price 1baoan 4 1 127.00 high 31 6.9291 5baoan 4 1 125.17 middle 15 5.8161 16baoan 3 1 151.00 high 20 4.9669 25baoan 3 1 116.00 high 18 5.0000 26baoan 5 1 151.25 high 30 7.6033 ...... ... ... ... ... ... ... 1232baoan 5 1 127.17 low 24 5.1113 1238baoan 4 1 130.74 low 30 13.0029 1239baoan 3 1 102.10 middle 28 10.8717 1241baoan 5 1 151.30 high 29 7.2703 1243baoan 4 1 142.25 high 32 6.3269
322 rows × 7 columns
datas
[ "roomnum" ] . isin
( [ 4 , 5 ] )
0 False
1 True
2 False
3 False
4 False...
1246 True
1247 False
1248 False
1249 False
1250 False
Name: roomnum, Length: 1251, dtype: bool
datas
[ datas
[ "roomnum" ] . isin
( [ 4 , 5 ] ) ]
districtroomnumhallAREAC_floorfloor_numper_price 1baoan 4 1 127.00 high 31 6.9291 5baoan 4 1 125.17 middle 15 5.8161 26baoan 5 1 151.25 high 30 7.6033 29baoan 4 1 143.45 middle 25 6.9711 36baoan 4 1 134.60 middle 32 9.1828 ...... ... ... ... ... ... ... 1232baoan 5 1 127.17 low 24 5.1113 1238baoan 4 1 130.74 low 30 13.0029 1241baoan 5 1 151.30 high 29 7.2703 1243baoan 4 1 142.25 high 32 6.3269 1246baoan 4 1 89.30 low 8 4.2553
224 rows × 7 columns
統計運算
datas
. describe
( )
roomnumhallAREAfloor_numper_price count1251.000000 1251.0 1251.000000 1251.000000 1251.000000 mean2.906475 1.0 92.409976 24.598721 6.643429 std0.940663 0.0 37.798122 9.332119 2.435132 min1.000000 1.0 21.950000 1.000000 1.193100 25%2.000000 1.0 75.000000 17.000000 5.075850 50%3.000000 1.0 87.800000 28.000000 5.906800 75%3.000000 1.0 101.375000 31.000000 7.761950 max8.000000 1.0 352.900000 53.000000 21.904000
統計函數
datas
. max ( axis
= 0 )
district zzzz
roomnum 8
hall 1
AREA 352.9
C_floor middle
floor_num 53
per_price 21.904
dtype: object
datas
. var
( axis
= 0 )
roomnum 0.884846
hall 0.000000
AREA 1428.698032
floor_num 87.088446
per_price 5.929870
dtype: float64
datas
. std
( axis
= 0 )
roomnum 0.940663
hall 0.000000
AREA 37.798122
floor_num 9.332119
per_price 2.435132
dtype: float64
datas
. iloc
[ : , 3 ]
0 89.3
1 127.0
2 28.0
3 28.0
4 78.0...
1246 89.3
1247 67.0
1248 67.4
1249 73.1
1250 86.2
Name: AREA, Length: 1251, dtype: float64
datas
. iloc
[ : , 3 ] . idxmax
( axis
= 0 )
759
datas
. iloc
[ 759 , 3 ]
352.9
datas
. iloc
[ : , 3 ] . idxmin
( axis
= 0 )
576
datas
. iloc
[ 576 , 3 ]
21.95
累計統計函數
datas
[ "per_price" ]
0 7.0773
1 6.9291
2 3.9286
3 3.3568
4 5.0769...
1246 4.2553
1247 3.8060
1248 5.3412
1249 5.9508
1250 4.5244
Name: per_price, Length: 1251, dtype: float64
datas
[ "per_price" ] . cumsum
( )
0 7.0773
1 14.0064
2 17.9350
3 21.2918
4 26.3687...
1246 8291.3076
1247 8295.1136
1248 8300.4548
1249 8306.4056
1250 8310.9300
Name: per_price, Length: 1251, dtype: float64
datas
[ "per_price" ] . sort_index
( ) . cumsum
( ) . plot
( )
<matplotlib.axes._subplots.AxesSubplot at 0x2039a3a3dc8>
import matplotlib
. pyplot
as plt
datas
[ "per_price" ] . sort_index
( ) . cumsum
( ) . plot
( )
plt
. show
( )
自定義運算
datas
[ [ "per_price" ] ] . apply ( lambda x
: x
. max ( ) - x
. min ( ) , axis
= 0 )
per_price 20.7109
dtype: float64
4.4 Pandas畫圖
datas
. plot
( x
= "AREA" , y
= "per_price" , kind
= "scatter" )
<matplotlib.axes._subplots.AxesSubplot at 0x203a343dec8>
datas
. plot
( x
= "floor_num" , y
= "per_price" , kind
= "scatter" )
<matplotlib.axes._subplots.AxesSubplot at 0x203a3a81bc8>
datas
. plot
( x
= "AREA" , y
= "per_price" , kind
= "barh" )
<matplotlib.axes._subplots.AxesSubplot at 0x203a2147f08>
4.5 文件的讀取與存儲
1.讀取csv文件 read_csv()
iris_data
= pd
. read_csv
( "./datas/iris.data.csv" )
iris_data
. head
( )
feature1feature2feature3feature4result 05.1 3.5 1.4 0.2 Iris-setosa 14.9 3.0 1.4 0.2 Iris-setosa 24.7 3.2 1.3 0.2 Iris-setosa 34.6 3.1 1.5 0.2 Iris-setosa 45.0 3.6 1.4 0.2 Iris-setosa
iris_data1
= pd
. read_csv
( "./datas/iris.data.csv" , usecols
= [ "feature1" , "feature2" , "result" ] )
iris_data1
. head
( )
feature1feature2result 05.1 3.5 Iris-setosa 14.9 3.0 Iris-setosa 24.7 3.2 Iris-setosa 34.6 3.1 Iris-setosa 45.0 3.6 Iris-setosa
iris_data2
= pd
. read_csv
( "./datas/iris.data2.csv" )
iris_data2
. head
( )
5.13.51.40.2Iris-setosa 04.9 3.0 1.4 0.2 Iris-setosa 14.7 3.2 1.3 0.2 Iris-setosa 24.6 3.1 1.5 0.2 Iris-setosa 35.0 3.6 1.4 0.2 Iris-setosa 45.4 3.9 1.7 0.4 Iris-setosa
iris_data2
= pd
. read_csv
( "./datas/iris.data2.csv" , names
= [ "feature1" , "feature2" , "feature3" , "feature4" , "result" ] )
iris_data2
. head
( )
feature1feature2feature3feature4result 05.1 3.5 1.4 0.2 Iris-setosa 14.9 3.0 1.4 0.2 Iris-setosa 24.7 3.2 1.3 0.2 Iris-setosa 34.6 3.1 1.5 0.2 Iris-setosa 45.0 3.6 1.4 0.2 Iris-setosa
datas
. head
( 5 )
districtroomnumhallAREAC_floorfloor_numper_price 0zzzz 3 1 89.3 middle 31 7.0773 1baoan 4 1 127.0 high 31 6.9291 2baoan 1 1 28.0 low 39 3.9286 3baoan 1 1 28.0 middle 30 3.3568 4baoan 2 1 78.0 middle 8 5.0769
datas
[ : - 1 ] . to_csv
( "./price_test" , columns
= [ 'per_price' ] , index
= False , mode
= "a" , header
= False )
perice_test
= pd
. read_csv
( "./price_test" )
perice_test
per_price 07.0773 16.9291 23.9286 33.3568 45.0769 ...... 37466.1932 37474.2553 37483.806 37495.3412 37505.9508
3751 rows × 1 columns
總結
以上是生活随笔 為你收集整理的八、Pandas的基本使用 的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔 網站內容還不錯,歡迎將生活随笔 推薦給好友。