pandas Basics
1. Creating a Series
import pandas as pd

countries = ['中國', '美國', '澳大利亞']
countries_s = pd.Series(countries)
print(type(countries_s))
print(countries_s)
print(countries_s.values)

2. Naming the index
import pandas as pd

country_dicts = {'CH': '中國', 'US': '美國', 'AU': '澳大利亞'}
country_dict_s = pd.Series(country_dicts)
# name the index
country_dict_s.index.name = 'index'
# name the data
country_dict_s.name = 'Country'
print(country_dict_s)
print(country_dict_s.values)
print(country_dict_s.index)

3. pd.DataFrame
import pandas as pd

country1 = pd.Series({'Name': '中國', 'Language': 'Chinese', 'Area': '9.597M km2', 'Happiness Rank': 79})
country2 = pd.Series({'Name': '美國', 'Language': 'English (US)', 'Area': '9.834M km2', 'Happiness Rank': 14})
country3 = pd.Series({'Name': '澳大利亞', 'Language': 'English (AU)', 'Area': '7.692M km2', 'Happiness Rank': 9})
df = pd.DataFrame([country1, country2, country3])
print(df)
print('df.values=', df.values)
print(type(df['Area']))
print('area values=', df['Area'].values)
print(df[['Name', 'Area']])
print(df[['Name', 'Area']].values)
# when editing the underlying array, use copy(); otherwise the original data may be modified
rank = df['Happiness Rank'].values.copy()
rank += 2
print(df['Happiness Rank'].values)
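Whether .values shares memory with the DataFrame depends on the column dtypes and the pandas version, so the defensive copy() above is the safe habit. A minimal sketch of the pattern, reusing the df built above:

# .values may or may not be a view into the DataFrame's internal storage,
# so copy() guarantees an independent buffer before in-place edits
rank_maybe_view = df['Happiness Rank'].values
rank_safe = df['Happiness Rank'].values.copy()
rank_safe += 2  # never touches df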
4. Adding a column

country_1 = pd.Series({'Name': '中國', 'Language': '漢語', 'Area': '11111'})
country_2 = pd.Series({'Name': '美國', 'Language': '英語', 'Area': '222'})
country_3 = pd.Series({'Name': '澳大利亞', 'Language': '英語', 'Area': '333'})
# print(country_1)
df = pd.DataFrame([country_1, country_2, country_3], index=['CH', 'US', 'AU'])
print(df)
# add a column, addressed by column label
df['location'] = '地球'
print(df)
df['region'] = ['亞洲', '北美洲', '大洋洲']
print(df)

5. Transposing and dropping
country_1 = pd.Series({'Name': '中國', 'Language': '漢語', 'Area': '11111'})
country_2 = pd.Series({'Name': '美國', 'Language': '英語', 'Area': '222'})
country_3 = pd.Series({'Name': '澳大利亞', 'Language': '英語', 'Area': '333'})
# print(country_1)
df = pd.DataFrame([country_1, country_2, country_3], index=['CH', 'US', 'AU'])
print(df)
# swap rows and columns
print('====================================')
print(df.T)
# drop a row
print('====================================')
print(df.drop(['CH']))
print('====================================')
print(df)
# note: drop does not modify the original DataFrame

6. Reading a CSV with index_col
import pandas as pd

# index_col specifies the index column
# usecols specifies which columns to read
reprot_2016_df = pd.read_csv('./2016.csv', index_col='Country',
                             usecols=['Country', 'Happiness Rank', 'Happiness Score', 'Region'])
# data preview
print(reprot_2016_df.head())
print(reprot_2016_df.values[:2, :])

reprot_2016_df = pd.read_csv('./2016.csv',
                             usecols=['Country', 'Happiness Rank', 'Happiness Score', 'Region'])
# data preview
print('==============================================')
print(reprot_2016_df.head())
print(reprot_2016_df.values[:2, :])
print('==============================================')
print(reprot_2016_df[['Region', 'Happiness Rank']].values[:2, :])

A second way to read a CSV
df_xc = pd.read_csv('../submit/submit_LF2551924C021_1007_xc.csv').copy()  # defect-detection results
print('len(df_xc)=', len(df_xc))
newdict = {}
for index, row in df_xc.iterrows():
    if index < 1:
        name = '_'.join(row.filename.split('_')[2:6])
        print('===================')
        print('row')
        print(row)
        print('====================')
        print('name=', name)
        if name not in newdict.keys():
            newdict[name] = [row.probability]
        else:
            newdict[name].append(row.probability)
    break
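Iterating with iterrows() is mainly useful for inspecting rows one by one; for collecting all probabilities per name, a groupby usually does the same job with less code. A minimal sketch, assuming df_xc has the filename and probability columns used above (the helper column name is introduced here for illustration):

# derive the grouping key from the filename, then collect probabilities per key
df_xc['name'] = df_xc['filename'].map(lambda f: '_'.join(f.split('_')[2:6]))
prob_dict = df_xc.groupby('name')['probability'].apply(list).to_dict()
print(prob_dict)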
7. pd.query

import pandas as pd
from numpy.random import randn

df = pd.DataFrame(randn(5, 2), columns=list('ab'))
print(df)
print(df.query('a > b'))
print(df.query('a > 0.2'))

8. Renaming columns
import pandas as pd

reprot_2016_df = pd.read_csv('./2016.csv',
                             usecols=['Country', 'Happiness Rank', 'Happiness Score', 'Region'])
# data preview
print('==============================================')
print(reprot_2016_df.head())
reprot_2016_df.rename(columns={'Country': '國家',
                               'Region': '地區',
                               'Happiness Rank': '排名',
                               'Happiness Score': '幸福指數'},
                      inplace=True)
print('==============================================')
print(reprot_2016_df.head())

9. Filtering
import pandas as pd

reprot_2016_df = pd.read_csv('./2016.csv',
                             usecols=['Country', 'Happiness Rank', 'Happiness Score', 'Region'])
# data preview
print('==============================================')
print(reprot_2016_df.head())
print('==============================================')
df = reprot_2016_df[reprot_2016_df['Country'] == 'Denmark']
print(df.head())
print('==============================================')
only_western_europe_10 = reprot_2016_df[(reprot_2016_df['Region'] == 'Western Europe')
                                        & (reprot_2016_df['Happiness Rank'] > 10)]
print(only_western_europe_10.head())

10. Handling NaN values
import pandas as pd

log_df = pd.read_csv('./data/log.csv')
print(log_df.head())
print('===============check whether head() contains null values=========================')
print(log_df.head().isnull())
print('===============rows where volume is not null=========================')
print(log_df[log_df['volume'].notnull()])
# set the index to time and user
log_df.set_index(['time', 'user'], inplace=True)
print(log_df)
# sort by index
print('===============sort by index=========================')
log_df.sort_index(inplace=True)
print(log_df)
print('================replace NaN with 0========================')
print(log_df.fillna(0))
print('================drop rows containing NaN========================')
print(log_df.dropna())
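Besides fillna(0) and dropna(), time-ordered logs are often filled forward so that each missing value takes the last observed one. A minimal sketch on the same log_df:

# forward-fill: propagate the last valid observation downwards
print(log_df.ffill())
# backward-fill: use the next valid observation instead
print(log_df.bfill())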
11. Handling duplicates

import pandas as pd

data = pd.DataFrame({'k1': ['one', 'two'] * 2 + ['two'],
                     'k2': [1, 3, 3, 4, 4]})
print(data)
print('===============check for duplicated rows=========================')
print(data.duplicated())
print('===============drop duplicated rows=========================')
print(data.drop_duplicates())
print('===============drop duplicates on the given column=========================')
print(data.drop_duplicates(['k2']))

12. Merging data
import pandas as pd

staff_df = pd.DataFrame([{'姓名': '張三', '部門': '研發部'},
                         {'姓名': '李四', '部門': '財務部'},
                         {'姓名': '趙六', '部門': '市場部'}])
student_df = pd.DataFrame([{'姓名': '張三', '專業': '計算機'},
                           {'姓名': '李四', '專業': '會計'},
                           {'姓名': '王五', '專業': '市場營銷'}])
print(staff_df)
print()
print(student_df)
print('===============outer merge, may contain NaN==================')
print(pd.merge(staff_df, student_df, how='outer', on='姓名'))
print('===============inner merge, no NaN==================')
print(pd.merge(staff_df, student_df, how='inner', on='姓名'))

13. Binning
import pandas as pd

# age data
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# bin edges
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
print(cats)
print('================get the bin codes================')
print(cats.codes)
print('===========count the elements in each bin=============')
print(pd.value_counts(cats))
print('===========binning with labels=============')
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
cats = pd.cut(ages, bins, labels=group_names)
print(cats)
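pd.cut uses the fixed edges you supply; when you would rather have bins holding roughly equal numbers of samples, pandas also provides pd.qcut, which picks the edges from quantiles. A minimal sketch on the same ages list:

# quantile-based binning: 4 bins with roughly the same number of ages in each
quartile_cats = pd.qcut(ages, 4)
print(quartile_cats)
print(pd.value_counts(quartile_cats))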
14. Plotting

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(100)
df = pd.DataFrame({'A': np.random.randn(365).cumsum(0),
                   'B': np.random.randn(365).cumsum(0) + 20,
                   'C': np.random.randn(365).cumsum(0) - 20},
                  index=pd.date_range('2017/1/1', periods=365))
print(df.head())
df.plot()
plt.show()

df.plot('A', 'B', kind='scatter')
plt.show()

# colour (c) and size (s) are taken from column 'B'
ax = df.plot('A', 'B', kind='scatter', c='B', s=df['B'], colormap='viridis')
# use the same scale on both axes
ax.set_aspect('equal')
plt.show()

df.plot(kind='box')
plt.show()

df.plot(kind='hist', alpha=0.7)
df.plot(kind='kde')
plt.show()

15. groupby
import pandas as pd

df = pd.DataFrame({'key1': ['a', 'a', 'b', 'c'],
                   'key2': ['one', 'two', 'one', 'two'],
                   'data1': [1, 2, 3, 4],
                   'data2': [2, 3, 4, 5]})
print(df)
print('====================')
grouped = df['data1'].groupby(df['key1'])
print(grouped.mean())
print('====================')
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
print(means)
print('====================')
# numeric_only=True keeps the string column key2 out of the mean (required in recent pandas)
print(df.groupby('key1').mean(numeric_only=True))

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

countries = ['Germany', 'UK', 'CH', 'JP', 'Switzerland']
data = pd.DataFrame({'InvoiceNo': ['c12', '24', '34', '3', '4', '5', '6'],
                     'price': [2, 1, 1, 2, 3, 4, 3],
                     'quantity': [3, 2, 2, 1, 4, 5, 4],
                     'country': ['UK', 'UK', 'UK', 'UK', 'CH', 'JP', 'CH']})
print(data)
# keep only the countries of interest
data = data[data['country'].isin(countries)].copy()
# an InvoiceNo starting with 'c' means the transaction was cancelled
cond1 = ~data['InvoiceNo'].str.startswith('c')
cond2 = data['country'] != 'UK'
data2 = data[cond1 & cond2].copy()
print('===============================================')
print(data2)
data2['total_cost'] = data2['price'] * data2['quantity']
print(data2)
print('===============================================')
cost_per_country = data2.groupby('country')['total_cost'].sum()
print(cost_per_country)
print('===============================================')
print(cost_per_country.to_frame())
# visualise the result
sns.barplot(data=cost_per_country.to_frame().T)
# cost_per_country.sort_values(ascending=False).plot(kind='bar')
plt.xticks(rotation=90)
plt.xlabel('Country')
plt.ylabel('costs')
plt.tight_layout()
plt.show()
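groupby is not limited to a single aggregation; agg accepts several functions at once and returns one column per function. A minimal sketch on the data2/total_cost frame built above:

# several aggregations per country in one call
stats_per_country = data2.groupby('country')['total_cost'].agg(['sum', 'mean', 'count'])
print(stats_per_country)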
16. apply: min-max normalization of each column

import pandas as pd

a = pd.Series({'v1': 2, 'v2': 3})
b = pd.Series({'v1': 5, 'v2': 10})
c = pd.Series({'v1': 4, 'v2': 6})
all_df = pd.DataFrame([a, b, c])  # renamed from the original "all" to avoid shadowing the built-in

def scale_minmax(col):
    return (col - col.min()) / (col.max() - col.min())

print('================')
print(all_df)
all_df = all_df.apply(scale_minmax, axis=0)
print('================')
print(all_df)

import pandas as pd

Img1 = pd.Series({'ID': '1.jpg', 'Detection': '311 707 472 842'})
Img2 = pd.Series({'ID': '2.jpg', 'Detection': '311 707 472 842'})
Img3 = pd.Series({'ID': '3.jpg', 'Detection': '311 707 472 842'})
df = pd.DataFrame([Img1, Img2, Img3])
print('========================')
print(df)
print(df.iloc[:, 0])
print('=========================')

def pre_data(df):
    # parse the 'Detection' string of each row into a list of floats
    # (the column is addressed by name; a positional iloc[:, 0] would depend on column order)
    df['Detection'] = df.apply(lambda x: [float(a) for a in x['Detection'].split(' ')], axis=1)

pre_data(df)
print(df)

a = '1 2 3 4'
print([float(i) for i in a.split(' ')])
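Because min() and max() already work column by column, the same min-max scaling can also be written without apply. A minimal sketch equivalent to scale_minmax (the operation is idempotent, so running it on the already-scaled all_df gives the same numbers):

# vectorized per-column min-max scaling, no apply needed
all_scaled = (all_df - all_df.min()) / (all_df.max() - all_df.min())
print(all_scaled)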
17. map: useful for building categorical features

Example 1:
import pandas as pd

x = pd.Series(['A', 'B', 'C'], index=['one', 'two', 'three'])
y = {'A': 1, 'B': 2, 'C': 3}
z = x.map(y)
print(x)
print(z)

Example 2:
# threshold the probability column into 0/1 predictions
# (df_yj is assumed to have been read earlier and to contain a 'probability' column)
df_yj['res'] = df_yj['probability'].map(lambda x: 0 if x < 0.2 else 1)
# write the result file to the submit folder; it can be submitted directly
df_yj.to_csv('../submit/LF2551924C021_1007_result_yj_0_1.csv', index=False)

18. Writing a CSV, method 1
import numpy as np
import pandas as pd

c = {}
a = np.array([1])
b = np.array(['1 2 3 4'])
c['ID'] = a
c['Detection'] = b
a_df = pd.DataFrame(c)
a_df.to_csv('test16.csv', index=False, columns=['ID', 'Detection'])

Writing a CSV, method 2
import numpy as np
import pandas as pd

a = np.array([1, 2, 3, 4])
b = np.array([3, 4, 5, 6])
a_df = pd.DataFrame(np.hstack([a.reshape(-1, 1), b.reshape(-1, 1)]))
a_df.to_csv('1.csv', index=False, header=['a', 'b'])

Writing a CSV, method 3
import pandas as pd

label_warp = {'normal': 0, 'defect': 1}
img_path = ['a', 'b', 'c']
label = ['normal', 'defect', 'normal']
label_file = pd.DataFrame({'img_path': img_path, 'label': label})
print(label_file)
# map the string labels to their numeric codes
label_file = label_file['label'].map(label_warp)
print(label_file)

Writing an Excel file
# res is assumed to be a dict or list of records prepared earlier
df = pd.DataFrame(res)
df.to_excel('./yunjiang_test3.xls', index=False, header=None)

19. Adding a header to a CSV that has none (note: pass header=None when reading a file without a header row)
csv_path = './train_only.csv'
df = pd.read_csv(csv_path, header=None)  # note: header=None because the file has no header row
print(df.shape)
df_value = df.values
# print(df_value[:-1, 1])
# print(len(df_value[:, 1]))
df = pd.DataFrame(df_value, columns=['name', 'xmin', 'ymin', 'xmax', 'ymax', 'class'])
df.to_csv('train_xml.csv', index=False)

20. loc, iloc, ix: loc indexes rows by label, iloc indexes rows by integer position, ix indexes rows by either label or position (a mix of loc and iloc)
import pandas as pd

data = [[1, 2, 3], [4, 5, 6]]
index = ['a', 'b']          # row labels
columns = ['c', 'd', 'e']   # column labels
df = pd.DataFrame(data, index=index, columns=columns)  # build a DataFrame
print(df)
print('===============')
# loc: index rows by label
print(df.loc['a'])
# iloc: index rows by integer position
print('=================')
print(df.iloc[0])
# ix: index rows by label or position (a mix of loc and iloc);
# note that ix is deprecated and has been removed in recent pandas versions
print('=================')
print(df.ix[0])
print(df.ix['a'])
print('=================')
print(df.loc[:, ['c']])
print(df.iloc[:, [0]])
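On pandas versions where ix no longer exists, the same two lookups can be written with loc and iloc only; a minimal sketch on the df above:

# replacements for the two ix calls above
print(df.iloc[0])    # position-based, same as df.ix[0]
print(df.loc['a'])   # label-based, same as df.ix['a']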
21. value_counts()

It can be used to count how many samples fall into each class.
import pandas as pd
from sklearn.datasets import load_iris

# load data
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
print('df.shape=', df.shape)
df['label'] = iris.target
print(df['label'].value_counts())
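value_counts can also report proportions instead of raw counts; a minimal sketch on the same label column:

# class proportions instead of absolute counts
print(df['label'].value_counts(normalize=True))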
22. Reading the name column of a CSV with pandas

import numpy as np
import pandas as pd

names = np.array(pd.read_csv('./hunhe.csv', header=None))[:, 0]
print(names)

name_df = np.array(pd.read_csv('./hunhe.csv', header=None).values[:, 0]).reshape(-1)
print(name_df)

23. pd.concat
import pandas as pd

df1 = pd.DataFrame([['a', 1], ['b', 2]], columns=['letter', 'number'])
print(df1)
df2 = pd.DataFrame([['c', 1], ['d', 2]], columns=['letter', 'number'])
print(df2)
df = pd.concat([df1, df2])
print(df)
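By default concat stacks frames vertically and keeps the original row labels; two common variations are renumbering the rows and concatenating side by side. A minimal sketch with the df1/df2 above:

# renumber the rows of the stacked result
print(pd.concat([df1, df2], ignore_index=True))
# concatenate along the columns instead of the rows
print(pd.concat([df1, df2], axis=1))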
24. Categorical

import pandas as pd

my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])
print('=====================')
print(my_categories)
# view the category labels
print('======================')
print(my_categories.categories)
# view the category codes
print('======================')
print(my_categories.codes)
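The same idea applies to DataFrame columns: converting a string column to the category dtype gives access to integer codes through the .cat accessor, which is handy for building categorical features. A minimal sketch with a made-up column:

# convert a string column to the category dtype and read its integer codes
demo = pd.DataFrame({'color': ['red', 'green', 'red', 'blue']})
demo['color'] = demo['color'].astype('category')
print(demo['color'].cat.categories)
demo['color_code'] = demo['color'].cat.codes
print(demo)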
25. Speeding up pandas with modin

pip install modin[ray]
### Read in the data with Pandas
import time
import pandas as pd

s = time.time()
df = pd.read_csv("esea_master_dmg_demos.part1.csv")
e = time.time()
print("Pandas Loading Time = {}".format(e-s))

### Read in the data with Modin
import modin.pandas as pd

s = time.time()
df = pd.read_csv("esea_master_dmg_demos.part1.csv")
e = time.time()
print("Modin Loading Time = {}".format(e-s))

26. Extracting the rows that correspond to each category value of a CSV column
import numpy as np
import pandas as pd


def gini(nums):
    probs = [nums.count(i) / len(nums) for i in set(nums)]
    gini = sum([p * (1 - p) for p in probs])
    return gini


def split_dataframe(data, col):
    '''
    function: split pandas dataframe to sub-df based on data and column.
    input: dataframe, column name.
    output: a dict of splited dataframe.
    '''
    # unique values of the column
    unique_values = data[col].unique()
    # print('==unique_values:', unique_values)
    # empty dict of dataframes
    result_dict = {elem: pd.DataFrame for elem in unique_values}
    # split the dataframe based on the column value
    for key in result_dict.keys():
        result_dict[key] = data[:][data[col] == key]
    return result_dict


def test_split_dataframe():
    df = pd.read_csv('./example_data.csv', dtype={'windy': 'str'})
    res = split_dataframe(df, 'temp')
    print('=res:', res.keys())
    print("=====res['mild']:\n", res['mild'])


if __name__ == '__main__':
    test_split_dataframe()
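The same split can be expressed directly with groupby, which yields one sub-DataFrame per unique value; a minimal sketch equivalent to split_dataframe:

# groupby already partitions the frame by the unique values of a column
def split_dataframe_groupby(data, col):
    return {key: group for key, group in data.groupby(col)}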
The contents of example_data.csv:

humility  outlook   temp  windy  play
high      sunny     hot   FALSE  no
high      sunny     hot   TRUE   no
high      overcast  hot   FALSE  yes
high      rainy     mild  FALSE  yes
normal    rainy     cool  FALSE  yes
normal    rainy     cool  TRUE   no
normal    overcast  cool  TRUE   yes
high      sunny     mild  FALSE  no
normal    sunny     cool  FALSE  yes
normal    rainy     mild  FALSE  yes
normal    sunny     mild  TRUE   yes
high      overcast  mild  TRUE   yes
normal    overcast  hot   FALSE  yes
high      rainy     mild  TRUE   no

Output: the keys of the split dict and the res['mild'] sub-DataFrame printed by test_split_dataframe().