當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

数据分析学习小结记录

發布時間：2024/1/8 编程问答 19 豆豆

生活随笔收集整理的這篇文章主要介紹了数据分析学习小结记录，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

------------Mysql獲取數據------------

# pymysql.cursors.DictCursor返回的數據集是包含數據表字段名的字典{}；
# [{'area': '上\u3000海'}, .......]

# coding: utf-8
import pymysql as mysql
import matplotlib.pyplot as plt
from pandas import DataFrame
from pymysql import cursors

plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # default font, so Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

# 1. Connect to MySQL and read the yearly population table (history_count)
#    and the population-by-age table (age_count). (2 minutes)

# 1-1 Open the connection. DictCursor makes fetchall() return one dict per
#     row, keyed by column name.
# NOTE(review): host and credentials are hard-coded here; consider loading
# them from configuration instead of committing them with the script.
conn = mysql.connect(host='192.168.0.3', port=3306, user='jiker-coding', passwd='jiker-coding-Pwd-1234',
                     db='coding', charset='utf8', cursorclass=cursors.DictCursor)
try:
    # 1-2 Yearly population figures, ordered by year.
    table = '`coding`.`history_count`'
    columns = ['year', 'total', 'man', 'woman', 'city', 'village', ]
    sql = "select %s from %s order by year" % (', '.join(["`%s`" % x for x in columns]), table)
    # Each query gets its own cursor, closed as soon as the rows are fetched.
    with conn.cursor() as cursor:
        cursor.execute(sql)
        history_count = cursor.fetchall()

    # 1-3 Population by age band, ordered by area.
    table = '`coding`.`age_count`'
    columns = ['area', 'age0_14', 'age15_59', 'age60', 'age65', ]
    sql = "select %s from %s order by area" % (', '.join(["`%s`" % x for x in columns]), table)
    with conn.cursor() as cursor:
        cursor.execute(sql)
        age_count = cursor.fetchall()
finally:
    # Release the connection even when one of the queries fails.
    conn.close()

print("[任務1 :: 獲取歷年人口變化] =", len(history_count), " [按年齡人口統計] =", len(age_count), "\n\n")

【項目一】折線圖、餅圖

------------Pandas處理數據------------

# Load the fetched rows into DataFrames. A DataFrame is a tabular structure
# with both a row index and a column index.
df_history_count = DataFrame(history_count)
df_age_count = DataFrame(age_count)

# Printed, df_age_count looks like:
#     area  age0_14  age15_59  age60  age65
# 0   上　海     9.80     66.82  23.38  16.28
# 1   云　南    19.57     65.52  14.91  10.75

# Average share of each age band across all areas.
df_age_count_group = df_age_count.drop(columns='area')  # drop the area column; returns a new frame
df_age_count_group['age60_65'] = df_age_count_group['age60'] - df_age_count_group['age65']  # share aged 60-65
df_age_count_group = df_age_count_group.mean()  # column-wise mean; the result is a Series
# Remove the age60 entry from the averaged Series; presumably it overlaps
# the age60_65 and age65 entries kept for the pie chart.
del df_age_count_group['age60']

# 刪除列（方法1），增加列，求平均值，刪除列（方法2）
# 輸出折線圖
df_history_count
# 輸出餅圖
df_age_count_group

【項目二】柱狀圖

# 2、統計各個發卡機構交易成功的筆數和金額并篩選出成功交易筆數排名前10的發卡機構（3分鐘）

# Per issuer (ISS_INS_ID_CD): number of transactions and total amount,
# keeping the 10 issuers with the highest transaction count.
df_trade_infos = DataFrame(trade_infos)
# NOTE(review): 'count' tallies every non-null TRANS_ST row. It equals the
# number of successful transactions only if trade_infos was already filtered
# to TRANS_ST = 1 upstream -- confirm.
result = df_trade_infos.groupby('ISS_INS_ID_CD').agg({'TRANS_ST': 'count', 'TRANS_AT': 'sum'})
result = result.sort_values(['TRANS_ST'], ascending=False).head(10)

# ISS_INS_ID_CD    發卡機構
# TRANS_ST         交易狀態，[TRANS_ST]=1 為交易成功
# TRANS_AT         交易金額

# 輸出柱狀圖
# 交易成功筆數排名前10的發卡機構及其交易金額
               TRANS_ST     TRANS_AT
ISS_INS_ID_CD
801020000         45369  10567549022
801059999         24372   4021232077

【項目四】柱狀圖

# 2. Overdue probability of loans by age group. (3 minutes)
df_bad_loan_infos = DataFrame(bad_loan_infos)
total_num = len(df_bad_loan_infos)

# Keep only overdue loans. The explicit copy makes the label assignments
# below write to an independent frame instead of a slice of the original.
overtime_df = df_bad_loan_infos[df_bad_loan_infos['past_due_days'] > 0].copy()

# Label each overdue loan with its age group. Rows with age below 18 get no
# label and are left out of the grouped counts.
overtime_df.loc[(overtime_df.age >= 18) & (overtime_df.age <= 25), 'AgeGroup'] = '18-25'
overtime_df.loc[(overtime_df.age >= 26) & (overtime_df.age <= 30), 'AgeGroup'] = '26-30'
overtime_df.loc[(overtime_df.age >= 31) & (overtime_df.age <= 40), 'AgeGroup'] = '31-40'
overtime_df.loc[(overtime_df.age >= 41), 'AgeGroup'] = '41+'

# Number of overdue loans per age group.
age = overtime_df[['AgeGroup', 'loan_id']].groupby('AgeGroup').agg('count')
# A group without any overdue loan is absent from the groupby result, which
# would make the .at lookups below raise KeyError; give such groups a 0 count.
age = age.reindex(['18-25', '26-30', '31-40', '41+'], fill_value=0)

# Overdue loans of each age group as a percentage of ALL loans (total_num),
# rounded to two decimals.
ratio_18_25 = round(age.at['18-25', 'loan_id'] / total_num * 100.0, 2)
ratio_26_30 = round(age.at['26-30', 'loan_id'] / total_num * 100.0, 2)
ratio_31_40 = round(age.at['31-40', 'loan_id'] / total_num * 100.0, 2)
ratio_41 = round(age.at['41+', 'loan_id'] / total_num * 100.0, 2)

# 輸出柱狀圖

# 數據篩選 data[data["past_due_days"]>0]
# 打標簽 data.loc[(data.age >= 18) & (data.age <= 25), "AgeGroup"] = "18-25"
# 分組統計 data[["AgeGroup", "loan_id"]].groupby("AgeGroup").agg("count")
# 分組計算比例 round(age.at["18-25", "loan_id"] / total_num * 100.0, 2)

【項目五】 K線圖

# 2. Mark the big bullish candles ("big yang" lines). (3 minutes)
df_klines = DataFrame(stock_klines)
df_klines['big_yang'] = 0  # big bullish candle flag (initialised to 0)
# Percent change = (close - previous close) / previous close * 100 (initialised to 0)
df_klines['wave_range'] = 0

# Repair the first candle, whose previous close is 0: use its own close
# instead, so its change is 0 and the division below never divides by zero.
yclose_is_zero = df_klines['yclose'] == 0
df_klines.loc[yclose_is_zero, 'yclose'] = df_klines.loc[yclose_is_zero, 'close']

# Percent change versus the previous close.
df_klines['wave_range'] = (df_klines['close'] - df_klines['yclose']) / df_klines['yclose'] * decimal.Decimal(100.0)

# A big bullish candle gains at least 1.5% and closes above its open.
# NOTE(review): the task text says "greater than 1.5%" while the code uses
# >= 1.5 -- confirm which boundary is intended.
df_klines.loc[(df_klines['wave_range'] >= 1.5) & (df_klines['close'] > df_klines['open']), 'big_yang'] = 1

# 讀題目，理解題目

【項目七】柱狀圖

# 2、整理房價走勢數據，計算出房價增長率，按房價增長率降序排列（3分鐘）

# House-price change per city in percent, sorted from largest gain down.
df_house = DataFrame(house_prices)
# (end - start) / start * 100; to_numeric turns the result into a numeric
# column, and errors='coerce' maps values that cannot be converted to NaN.
df_house['wave_range'] = pd.to_numeric(
    (df_house['end_price'] - df_house['start_price']) / df_house['start_price'] * decimal.Decimal(100.0),
    errors='coerce',
)
df_house = df_house.sort_values(by=['wave_range'], ascending=[False])
df_house = df_house.round({'wave_range': 2})  # two decimals for display

print("[任務2：熱門城市房價漲跌幅]")
print(df_house[['city', 'wave_range']], "\n\n")

# start_price 期初價格（計算漲跌幅的基準）
# end_price 期末價格

# to_numeric 到數字，round(num,2)
# decimal.Decimal 十進制數（精確小數運算）
# coerce 強迫，迫使
# sort_values 排序

總結

以上是生活随笔為你收集整理的数据分析学习小结记录的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：从QQ音乐开发，探讨如何利用腾讯云SDK
下一篇： c语言学习周报（2020.11.21-1

编程问答

数据分析 学习小结记录

總結

数据分析学习小结记录