import numpy as np

import pandas as pd
from pandas import Series,DataFrame

import matplotlib.pyplot as plt
%matplotlib inline #不用再写show

from imblearn.over_sampling import SMOTE

# Load the credit-card transactions CSV from the working directory and show
# its (rows, columns) shape.
credit = pd.read_csv('./creditcard.csv')
credit.shape

(284807, 31)

credit.head()

credit.isnull().any()  # per column: True if the column holds any missing value

Time      False
V1        False
V2        False
V3        False
V4        False
V5        False
V6        False
V7        False
V8        False
V9        False
V10       False
V11       False
V12       False
V13       False
V14       False
V15       False
V16       False
V17       False
V18       False
V19       False
V20       False
V21       False
V22       False
V23       False
V24       False
V25       False
V26       False
V27       False
V28       False
Amount    False
Class     False
dtype: bool

credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
Time      284807 non-null float64
V1        284807 non-null float64
V2        284807 non-null float64
V3        284807 non-null float64
V4        284807 non-null float64
V5        284807 non-null float64
V6        284807 non-null float64
V7        284807 non-null float64
V8        284807 non-null float64
V9        284807 non-null float64
V10       284807 non-null float64
V11       284807 non-null float64
V12       284807 non-null float64
V13       284807 non-null float64
V14       284807 non-null float64
V15       284807 non-null float64
V16       284807 non-null float64
V17       284807 non-null float64
V18       284807 non-null float64
V19       284807 non-null float64
V20       284807 non-null float64
V21       284807 non-null float64
V22       284807 non-null float64
V23       284807 non-null float64
V24       284807 non-null float64
V25       284807 non-null float64
V26       284807 non-null float64
V27       284807 non-null float64
V28       284807 non-null float64
Amount    284807 non-null float64
Class     284807 non-null int64
dtypes: float64(30), int64(1)
memory usage: 67.4 MB

c_counts  = credit['Class'].value_counts()
c_counts

0    284315
1       492
Name: Class, dtype: int64

type(c_counts)

pandas.core.series.Series

# Show the class imbalance twice, side by side, on one figure.
plt.figure(figsize=(8, 8))

# Left panel: pie chart with each class's percentage share.
ax = plt.subplot(121)
c_counts.plot.pie(autopct='%0.3f%%', ax=ax)

# Right panel: bar chart of the absolute counts.
ax = plt.subplot(122)
c_counts.plot.bar(ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x18f0a2ed940>

7200%3600

0

divmod(7201,3600)[0]

2

# Convert `Time` from seconds to whole hours.
# Vectorised floor division yields the same values as mapping
# `divmod(x, 3600)[0]` over each row, without one Python call per element.
credit['Time'] = credit['Time'] // 3600
# Author's notes on the element-wise alternatives:
#   - map:   Series only
#   - apply: works on both Series and DataFrame
#   - agg:   meant for aggregation functions

credit['Time']

0          0.0
1          0.0
2          0.0
3          0.0
4          0.0
5          0.0
6          0.0
7          0.0
8          0.0
9          0.0
10         0.0
11         0.0
12         0.0
13         0.0
14         0.0
15         0.0
16         0.0
17         0.0
18         0.0
19         0.0
20         0.0
21         0.0
22         0.0
23         0.0
24         0.0
25         0.0
26         0.0
27         0.0
28         0.0
29         0.0
          ... 
284777    47.0
284778    47.0
284779    47.0
284780    47.0
284781    47.0
284782    47.0
284783    47.0
284784    47.0
284785    47.0
284786    47.0
284787    47.0
284788    47.0
284789    47.0
284790    47.0
284791    47.0
284792    47.0
284793    47.0
284794    47.0
284795    47.0
284796    47.0
284797    47.0
284798    47.0
284799    47.0
284800    47.0
284801    47.0
284802    47.0
284803    47.0
284804    47.0
284805    47.0
284806    47.0
Name: Time, Length: 284807, dtype: float64

# Inspect the anonymised features V1..V28, starting with V1.

# Boolean row masks per class. Per the value counts shown earlier,
# Class == 0 has 284,315 rows and Class == 1 has 492.
cond0 = credit['Class'].eq(0)
cond1 = credit['Class'].eq(1)

# Histogram of V1 for each class (finer bins for the much larger class 0).
credit.loc[cond0, 'V1'].plot.hist(bins=500)
credit.loc[cond1, 'V1'].plot.hist(bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x18f0f3635c0>

credit.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

# Names of the 28 anonymised feature columns to plot per class below.
cols = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']

# NOTE(review): `drop_list` is not read anywhere in the visible code; the
# columns actually removed later come from separate `drops` lists.
drop_list = ['V1','V3','V5']

# For every feature in `cols`, overlay the normalised histograms of the two
# classes so features whose class distributions differ can be spotted by eye.
cond_0 = credit['Class'] == 0
cond_1 = credit['Class'] == 1

# One stacked panel per feature; size the figure from the feature count
# instead of a hard-coded 28.
n_features = len(cols)
plt.figure(figsize=(12, n_features * 6))

for i, col in enumerate(cols):
    ax = plt.subplot(n_features, 1, i + 1)

    # `density=True` replaces the deprecated `normed` keyword. Normalising
    # both histograms makes the 492-row class comparable to the 284k-row one.
    credit.loc[cond_0, col].plot(kind='hist', bins=500, density=True, ax=ax)
    credit.loc[cond_1, col].plot(kind='hist', bins=50, density=True, ax=ax)

    ax.set_title(col)

C:\Users\softpo.DESKTOP-PN692CT\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "

# Features excluded from modelling (presumably the ones whose per-class
# histograms look alike — confirm against the plots).
drops = ['V%d' % n for n in (13, 15, 20, 22, 23, 24, 25, 26, 27, 28)]

# `drop` returns a new frame; `credit` itself keeps all of its columns.
credit2 = credit.drop(columns=drops)

credit.shape

(284807, 31)

credit2.shape

(284807, 21)

from sklearn.preprocessing import StandardScaler

credit2['Amount'].max()

25691.16

credit2['Amount'].min()

0.0

credit2['V8'].max()

20.0072083651213

credit2['V8'].min()

-73.21671845526741

# Standardise `Time` and `Amount` in place with StandardScaler's default
# zero-mean / unit-variance scaling.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later in this file, so test-set statistics feed
# into the scaling — consider fitting on the training split only.
standScaler = StandardScaler() 

# NOTE: rebinds `cols`, which previously held the V1..V28 feature names.
cols = ['Time','Amount']

credit2[cols] = standScaler.fit_transform(credit2[cols])

credit2['Amount'].max()

102.36224270928423

credit2['Amount'].min()

-0.35322939296682354

credit2['Time'].min()

-1.9602638886856412

credit2['Time'].max()

1.6044448928637376

from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient-boosted tree classifier with default hyper-parameters (no
# fixed random_state); its feature importances are what gets read below.
clf = GradientBoostingClassifier()

# Every column except the last one is a feature (`Class` is the last column).
X_train = credit2.iloc[:,:-1]

# NOTE: `X_train` / `y_train` here are the full dataset, not a training
# split; both names are rebound by the train/test split later in the file.
y_train = credit2['Class']
clf.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

X_train.shape

(284807, 20)

feature_importances_ = clf.feature_importances_
feature_importances_

array([2.13317223e-03, 0.00000000e+00, 2.34971456e-02, 3.24365427e-02,
       0.00000000e+00, 0.00000000e+00, 3.95172432e-03, 0.00000000e+00,
       0.00000000e+00, 9.08812387e-02, 8.30824308e-03, 0.00000000e+00,
       2.64104332e-02, 7.28912511e-02, 2.58168549e-03, 1.35408766e-02,
       7.23309518e-01, 0.00000000e+00, 5.81693232e-05, 0.00000000e+00])

cols = X_train.columns
cols

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V19', 'V21', 'Amount'],
      dtype='object')

# Positions of the features ordered from most to least important.
index = np.argsort(feature_importances_)[::-1]
index

array([16,  9, 13,  3, 12,  2, 15, 10,  6, 14,  0, 18,  8,  7,  5,  4, 11,
       17,  1, 19], dtype=int64)

len(index)

20

# Bar chart of the feature importances, tallest first, labelled by feature name.
plt.figure(figsize=(12, 9))
positions = np.arange(len(index))
plt.bar(positions, feature_importances_[index])

# Assign to `_` so the tick objects returned by xticks are not echoed.
_ = plt.xticks(positions, cols[index])

# Features whose importance in the boosted-tree fit above was zero or negligible.
drops = ['V7', 'V21', 'V8', 'V5', 'V4', 'V11', 'V19', 'V1', 'Amount']

# Build the final modelling frame without them and show its shape.
credit3 = credit2.drop(columns=drops)
credit3.shape

(284807, 12)

from sklearn.model_selection import train_test_split

# Features are every column but the last; the target is `Class`.
X = credit3.iloc[:, :-1]
y = credit3['Class']

# Hold out 30% for testing.
# - stratify=y keeps the (very small) share of Class 1 rows the same in both
#   splits, instead of leaving it to chance.
# - random_state fixes the shuffle so the split, and every metric computed
#   from it, is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

# X_train / y_train are the training data.
# Training should see balanced classes, so only the training split gets
# resampled; the test split stays imbalanced, which is fine for evaluation.

y_train.value_counts()

0    199019
1       345
Name: Class, dtype: int64

# Oversample the minority class of the training split only. The fixed seed
# makes the synthetic samples — and the scores computed from models trained
# on them — repeatable across runs.
smote = SMOTE(random_state=42)

# NOTE: the recorded run returned NumPy arrays here; the container type may
# differ in other imbalanced-learn versions — verify if downstream code relies on it.
X_train_new, y_train_new = smote.fit_resample(X_train, y_train)

type(X_train_new)

numpy.ndarray

type(y_train_new)

numpy.ndarray

y_train_new

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

y_train_new.shape

(398038,)

y_train_new

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

Series(y_train_new).value_counts()

1    199019
0    199019
dtype: int64

# for 循环
import itertools

# Plotting helper:
# draws how the true labels compare with the predicted labels.
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw confusion matrix ``cm`` as an annotated heat map via pyplot.

    cm      -- 2-D array of counts; rows are laid out along the y axis
               ("True label") and columns along the x axis ("Predicted label").
    classes -- tick labels, used for both axes.
    title   -- title placed above the plot.
    cmap    -- colormap passed to ``imshow``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0)
    plt.yticks(positions, classes)

    # Cells above half of the largest count get white text, the others black,
    # so each number stays readable against its cell colour.
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            count = cm[row, col]
            text_color = "white" if count > half_max else "black"
            plt.text(col, row, count,
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression with default hyper-parameters.
logistic = LogisticRegression()

# Train on the SMOTE-balanced training data.
logistic.fit(X_train_new,y_train_new)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Predict on the test split, which was not resampled.
y_ = logistic.predict(X_test)

# Cross-tabulation: true class in rows, predicted class in columns.
pd.crosstab(y_test,y_)

# Precision for class 0 (legitimate): correct class-0 predictions / all class-0 predictions
83648 / (83648 + 27)

0.9996773229757993

# Recall for class 0 (legitimate): correct class-0 predictions / all true class-0 rows
83648 / (83648 + 1648)

0.9806790470830988

# Precision for class 1 (fraud): correct class-1 predictions / all class-1 predictions
120 / (120 + 1648)

0.06787330316742081

# Recall for class 1 (fraud): correct class-1 predictions / all true class-1 rows
120 / (120 + 27)

0.8163265306122449

# Precision and recall on the fraudulent (stolen-card) transactions


from sklearn.metrics import confusion_matrix

# Confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_test,y_)
cm

array([[83648,  1648],
       [   27,   120]], dtype=int64)

# Recall = TP / (TP + FN): the share of items that should be retrieved (all actual positives) that were correctly retrieved

plot_confusion_matrix(cm,[0,1],title='Recall:%0.3f'%(cm[1,1]/(cm[1,0] + cm[1,1])))

from sklearn.model_selection import GridSearchCV

# Tune the logistic regression with an exhaustive, 10-fold cross-validated
# grid search on the SMOTE-balanced training data.
# The solver is pinned to liblinear because it supports both the 'l1' and 'l2'
# penalties in the grid (it is also the solver the recorded run used by default).
logistic = LogisticRegression(solver='liblinear')
param_grid = {
    'tol': [1e-3, 1e-4, 1e-5],
    'C': [1, 0.1, 10, 100],
    'penalty': ['l1', 'l2'],
}
# The `iid` argument is no longer passed: it was deprecated and later removed
# from GridSearchCV, so supplying it fails on current scikit-learn.
# NOTE: `clf` is rebound here; it previously held the gradient-boosting model.
clf = GridSearchCV(logistic, param_grid=param_grid, cv=10, n_jobs=-1)
clf.fit(X_train_new, y_train_new)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'tol': [0.001, 0.0001, 1e-05], 'C': [1, 0.1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

clf.best_score_

0.9477637813475095

clf.best_score_

0.9478115174030656

clf.best_params_

{'C': 0.1, 'tol': 0.0001}

y3_ = clf.best_estimator_.predict(X_test)

confusion_matrix(y_test,y3_)

array([[83648,  1648],
       [   27,   120]], dtype=int64)

y2_ = clf.predict(X_test)

confusion_matrix(y_test,y2_)

array([[83648,  1648],
       [   27,   120]], dtype=int64)

# Compare class-1 recall of the plain logistic regression (`cm`) with that of
# the grid-searched model (`cm2`) on the same test split.
y4_ = clf.predict(X_test)
cm2 = confusion_matrix(y_test, y4_)

recall_logistic = cm[1, 1] / (cm[1, 0] + cm[1, 1])
plot_confusion_matrix(cm, [0, 1], title='Logistic Recall:%0.3f' % recall_logistic)

recall_grid = cm2[1, 1] / (cm2[1, 0] + cm2[1, 1])
plot_confusion_matrix(cm2, [0, 1], title='GridSearchCV Recall:%0.3f' % recall_grid)

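# Predicted class probabilities
# Open question: how much card fraud can be tolerated? (this drives the choice of decision threshold)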
y_proba_ = clf.predict_proba(X_test)
y_proba_

array([[0.93548127, 0.06451873],
       [0.98059225, 0.01940775],
       [0.82028904, 0.17971096],
       ...,
       [0.95243485, 0.04756515],
       [0.94441579, 0.05558421],
       [0.98251652, 0.01748348]])

from sklearn.metrics import auc,roc_curve

# Sweep the decision threshold applied to the predicted class-1 probability
# and record, per threshold: recall, precision, an AUC value and the
# confusion matrix.
thresholds = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

recalls = []

# (sic) spelling kept because the plotting code reads this name.
precissions = []

aucs = []

cms = []
for threshold in thresholds:
    # Predict class 1 whenever its probability reaches the threshold.
    y_ = y_proba_[:, 1] >= threshold

    cm = confusion_matrix(y_test, y_)

    # Recall = TP / (TP + FN)
    recalls.append(cm[1, 1] / (cm[1, 0] + cm[1, 1]))

    # Precision = TP / (TP + FP). The previous expression, (TN + TP) / total,
    # was accuracy stored under the precision name. If nothing is predicted
    # positive the ratio is undefined; record 0.0 instead of dividing by zero.
    predicted_positive = cm[0, 1] + cm[1, 1]
    precissions.append(cm[1, 1] / predicted_positive if predicted_positive else 0.0)

    # NOTE(review): roc_curve receives hard 0/1 predictions here, so this is
    # the area under a single-operating-point ROC curve, i.e. (TPR + TNR) / 2
    # at this threshold, not the model's threshold-independent ROC AUC.
    fpr, tpr, _ = roc_curve(y_test, y_)

    auc_ = auc(fpr, tpr)

    aucs.append(auc_)

    cms.append(cm)

# One confusion-matrix panel per threshold, laid out on a 3x4 grid.
plt.figure(figsize=(24, 18))
for i, cm in enumerate(cms):
    recall = cm[1, 1] / (cm[1, 0] + cm[1, 1])
    panel_title = 'thresholds:%0.2f,Recall:%0.2f' % (thresholds[i], recall)
    plt.subplot(3, 4, i + 1)
    plot_confusion_matrix(cm, [0, 1], title=panel_title)

# Trend of each recorded metric across the decision thresholds, to help
# choose a threshold.
plt.figure(figsize=(12, 6))

plt.plot(thresholds, recalls, label='Recall')

# Legend spelling corrected from 'Precission'.
plt.plot(thresholds, precissions, label='Precision')

plt.plot(thresholds, aucs, label='auc')

plt.legend()

plt.xlabel('thresholds')

# plt.ylim(0.5,1.2)

Text(0.5,0,'thresholds')

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	...	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	...	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62
1	0.0	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	...	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69
2	1.0	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	...	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	378.66
3	1.0	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	...	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	123.50
4	2.0	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	...	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	69.99

构建信用卡反欺诈预测模型

本项目需解决的问题

建模思路

项目背景

场景解析（算法选择）

1 数据获取与解析

2 特征工程

特征转换：将时间单位由秒转换为小时

特征选择

特征缩放

对特征的重要性进行排序，以进一步减少变量

利用GBDT梯度提升决策树进行特征重要性排序

模型训练

SMOTE过采样

自定义可视化函数

单独的逻辑回归求得查全率Recall rate

Recall也叫召回率

利用GridSearchCV进行交叉验证和模型参数自动调优

预测

结果可视化

对比逻辑斯蒂回归和GridSearchCV结果

模型评估

考虑设置阈值，来调整预测被盗刷的概率，以此来调整模型的查全率（Recall）

趋势图

找出模型最优的阈值