Credit Card Fraud Detection (Tested)

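The original leaves the `read_csv` path blank, and that blank is kept below. Judging by the column names it relies on (Time, Amount, Class), the data appears to be the well-known Kaggle "Credit Card Fraud Detection" dataset. Before running the full script, a quick sanity check might look like this (the creditcard.csv file name is an assumption, not from the original):

import pandas as pd

# Hypothetical file name -- the original leaves the path blank. The column
# names checked here match the Kaggle credit card fraud dataset.
data = pd.read_csv('creditcard.csv')
assert {'Time', 'Amount', 'Class'}.issubset(data.columns)
print(data['Class'].value_counts())  # heavily imbalanced: frauds are a tiny minority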
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# The CSV path is left blank in the original.
data = pd.read_csv(r'')

# Show the raw class distribution: for 'Class', 0 means a normal
# transaction and 1 means fraud.
count_classes = data['Class'].value_counts(sort=True).sort_index()
count_classes.plot(kind='bar')
plt.xlabel('class')
plt.ylabel('frequency')

from sklearn.preprocessing import StandardScaler

# Add a new 'normAmount' column: StandardScaler().fit_transform()
# standardizes the feature, and reshape(-1, 1) turns the values into a
# single column.
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data = data.drop(['Time', 'Amount'], axis=1)

X = data.iloc[:, data.columns != 'Class']
y = data.iloc[:, data.columns == 'Class']

# Undersampling: keep all fraud samples and draw an equal number of normal
# samples so both classes end up the same size.
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)   # indices of the fraud samples
normal_indices = np.array(data[data.Class == 0].index)
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)

# Merge the two index arrays, then look up the corresponding rows.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
under_sample_data = data.iloc[under_sample_indices, :]

# X holds the features, y the labels.
X_undersample = under_sample_data.iloc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.iloc[:, under_sample_data.columns == 'Class']
print('Share of normal samples:', len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
print('Share of fraud samples:', len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
print('Total samples:', len(under_sample_data))

# Split the data. The full dataset is split here as well -- this matters,
# because the final evaluation uses the original test set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print('Samples in the original training set:', len(X_train))
print('Samples in the original test set:', len(X_test))
print('Total original samples:', len(X_train) + len(X_test))

X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0)
print('Samples in the undersampled training set:', len(X_train_undersample))
print('Samples in the undersampled test set:', len(X_test_undersample))
print('Total undersampled samples:', len(X_test_undersample) + len(X_train_undersample))

from sklearn.model_selection import KFold             # cross-validation
from sklearn.linear_model import LogisticRegression   # logistic regression
from sklearn.metrics import recall_score              # recall metric


def printing_Kfold_scores(x_train_data, y_train_data):
    # The old API was `from sklearn.cross_validation import KFold` with
    # `KFold(len(y_train_data), 5, shuffle=False)`; with
    # sklearn.model_selection only the number of splits is passed.
    fold = KFold(5, shuffle=False)  # split into 5 folds

    # Candidate regularization strengths.
    c_param_range = [0.01, 0.1, 1, 10, 100]

    # Table for the results. (The original used
    # `index=range(len(c_param_range), 2)`, which is an empty range and
    # breaks the column assignment below.)
    results_table = pd.DataFrame(index=range(len(c_param_range)),
                                 columns=['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range

    # K-fold cross-validation yields two index sets per split:
    # training set = indices[0], validation set = indices[1].
    j = 0
    for c_param in c_param_range:
        print('-------------------------------------------')
        print('Regularization strength C:', c_param)
        print('-------------------------------------------')
        print('')

        recall_accs = []
        # Step through the cross-validation splits; enumerate() attaches an
        # iteration counter to each (train, validation) index pair.
        for iteration, indices in enumerate(fold.split(x_train_data)):
            # Build the model with the current parameter. The original
            # `LogisticRegression(C=c_param, penalty='l1')` fails on newer
            # sklearn, which needs a solver that supports L1 (liblinear).
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')

            # Train the model; both X and y must use the training indices (0).
            lr.fit(x_train_data.iloc[indices[0], :], y_train_data.iloc[indices[0], :].values.ravel())

            # Predict on the validation set (index 1).
            y_pred_undersample = lr.predict(x_train_data.iloc[indices[1], :].values)

            # Evaluate: recall_score takes the true values and the predictions.
            recall_acc = recall_score(y_train_data.iloc[indices[1], :].values.ravel(), y_pred_undersample)

            # Keep each fold's result so the mean can be computed later.
            recall_accs.append(recall_acc)
            print('Iteration', iteration, ': recall =', recall_acc)

        # After all folds, record the mean recall.
        results_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)
        j += 1
        print('')
        print('Mean recall:', np.mean(recall_accs))
        print('')

    # The best parameter is simply the one with the highest mean recall.
    best_c = results_table.loc[results_table['Mean recall score'].astype('float64').idxmax()]['C_parameter']

    print('*********************************************************************************')
    print('Best C parameter =', best_c)
    print('*********************************************************************************')

    return best_c


best_c = printing_Kfold_scores(X_train_undersample, y_train_undersample)

import itertools
from sklearn.metrics import confusion_matrix


# Confusion-matrix plotting helper.
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j], horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# Train with the best C and evaluate on the undersampled test set.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample.values)
cnf_matrix = confusion_matrix(y_test_undersample, y_pred_undersample)
np.set_printoptions(precision=2)
print('Recall:', cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, class_names, title='Confusion matrix')

# Evaluate on the test set of the original (full) data.
# Same procedure as above; only the dataset changes.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred = lr.predict(X_test.values)
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print('Recall:', cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, class_names, title='Confusion matrix')
plt.show()

# Sweep the decision threshold and draw a confusion matrix for each value.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample, y_train_undersample.values.ravel())
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
plt.figure(figsize=(10, 10))
j = 1
for i in thresholds:
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > i
    plt.subplot(3, 3, j)
    j += 1
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    np.set_printoptions(precision=2)
    print('Recall:', cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Threshold >= %s' % i)
plt.show()
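One note on the threshold sweep above: it only reports recall, but lowering the threshold buys recall at the cost of precision. A small sketch, not in the original, that scores the same probabilities with both metrics (it assumes lr, X_test_undersample, and y_test_undersample from the script above are still in scope):

from sklearn.metrics import precision_score, recall_score

# Score the fraud probabilities at a few thresholds to show the trade-off.
probs = lr.predict_proba(X_test_undersample.values)[:, 1]
for t in (0.1, 0.3, 0.5, 0.7, 0.9):
    preds = probs > t
    print('threshold %.1f: recall %.3f, precision %.3f' % (
        t,
        recall_score(y_test_undersample.values.ravel(), preds),
        precision_score(y_test_undersample.values.ravel(), preds, zero_division=0)))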
# Second approach: oversampling with SMOTE. Instead of shrinking the
# majority class, SMOTE generates synthetic minority samples until the
# classes are balanced. The original repeats the imports and the
# printing_Kfold_scores / plot_confusion_matrix definitions verbatim at
# this point; the script below reuses the ones defined above.
from imblearn.over_sampling import SMOTE

data = pd.read_csv(r'')
columns = data.columns
features_columns = columns.delete(len(columns) - 1)  # drop the last column ('Class')
features = data[features_columns]
labels = data['Class']

# Split first, then oversample only the training split, so the test set
# is never contaminated with synthetic samples.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=0)
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)
print(len(os_labels[os_labels == 1]))
os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)

best_c = printing_Kfold_scores(os_features, os_labels)

lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(os_features, os_labels.values.ravel())
y_pred = lr.predict(features_test.values)
cnf_matrix = confusion_matrix(labels_test, y_pred)
np.set_printoptions(precision=2)
print('Recall:', cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
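As an optional follow-up, not in the original: sklearn's classification_report prints precision, recall, and F1 for both classes in one call, which makes the undersampling-vs-SMOTE comparison easier to read. A minimal sketch, assuming lr, features_test, and labels_test from the SMOTE script are in scope:

from sklearn.metrics import classification_report

# One table with per-class precision, recall, and F1 on the untouched test set.
print(classification_report(labels_test, lr.predict(features_test.values),
                            target_names=['normal', 'fraud']))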

總結(jié)

以上是生活随笔為你收集整理的信用卡诈骗检测(经过测试)的全部?jī)?nèi)容,希望文章能夠幫你解決所遇到的問(wèn)題。
