# -*- coding: utf-8 -*-
"""
Created on Thu Sep 10 17:07:47 2020
@author: 谢树鹏
"""
import pandas as pd
import matplotlib.pyplot as plt
#%matplotlib inline
import seaborn as sns
import warnings
import matplotlib
# Suppress every warning raised by the libraries used below.
warnings.filterwarnings('ignore')
# Print all columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load the training set and rename the PAY_0 column to PAY_1.
train_data = pd.read_csv(r'E:\文档\我的文档\大数据\题目和模板\Train.csv')
train_data = train_data.rename(columns={'PAY_0': 'PAY_1'})

# Plot fonts: KaiTi for both font families (presumably so that Chinese labels
# render; confirm the font is installed on the target machine).
matplotlib.rcParams.update({
    'font.sans-serif': ['KaiTi'],
    'font.serif': ['KaiTi'],
})
sns.set_style("darkgrid", {"font.sans-serif": ['KaiTi', 'Arial']})
# Fixes the minus sign '-' being rendered as a box in saved figures.
matplotlib.rcParams['axes.unicode_minus'] = False
# Feature preprocessing for the training and test sets.
#
# Both sets go through the same two steps, using parameters that are defined
# once (bin edges) or learned from the training set only (scaling statistics):
#   1. LIMIT_BAL (credit limit) is divided by 10,000, binned, and each bin is
#      encoded as an integer id.
#   2. AGE, BILL_AMT1..6 and PAY_AMT1..6 are standardised.
from sklearn import preprocessing

# Bin edges in units of 10,000. The lowest edge is 0 (not 1) because pd.cut
# builds left-open intervals by default: with an edge of 1, a limit of exactly
# 10,000 would fall outside every bin and become NaN.
bins = [0, 5, 20, 35, 50, 100]

# Columns that get zero-mean / unit-variance scaling.
scaled_columns = (
    ['AGE']
    + ['BILL_AMT%d' % i for i in range(1, 7)]
    + ['PAY_AMT%d' % i for i in range(1, 7)]
)

# ---- Training set ----
train_data['LIMIT_BAL_bins'] = pd.cut(train_data['LIMIT_BAL'] / 10000, bins)
# Mean of the target per credit-limit bin (kept for inspection).
by_bins = train_data.groupby('LIMIT_BAL_bins')['y'].mean()
# Categorical codes follow bin order, so a given bin always maps to the same
# id in both data sets (pd.factorize numbers values by order of appearance,
# which differs between files). Values outside all bins get -1.
train_data['LIMIT_BAL_bins_id'] = train_data['LIMIT_BAL_bins'].cat.codes

# StandardScaler keeps a separate mean/std per column, so fitting all columns
# at once gives the same training values as fitting them one by one.
scaler = preprocessing.StandardScaler()
train_data[scaled_columns] = scaler.fit_transform(
    train_data[scaled_columns].values
)

# ---- Test set ----
test_data = pd.read_csv(r'E:\文档\我的文档\大数据\题目和模板\Test0904.csv')
test_data['LIMIT_BAL_bins'] = pd.cut(test_data['LIMIT_BAL'] / 10000, bins)
test_data['LIMIT_BAL_bins_id'] = test_data['LIMIT_BAL_bins'].cat.codes
# Apply the statistics learned from the training set; re-fitting on the test
# set would put the two sets on different scales.
test_data[scaled_columns] = scaler.transform(test_data[scaled_columns].values)
# Split each data set into the feature matrix and the target column `y`.
# NOTE(review): every column except `y` is kept as a feature, including any
# identifier column such as ID - confirm that this is intended.
train_data_X = train_data.drop(['y'], axis=1)
train_data_Y = train_data['y']
test_data_X = test_data.drop(['y'], axis=1)
test_data_Y = test_data['y']

# LIMIT_BAL_bins holds the intervals produced by pd.cut; per the original note
# this caused a float-type error in clf.fit(), so the column is replaced with
# integer codes from an equal-width 4-bin cut of LIMIT_BAL.
# The edges are computed on the training set and reused for the test set, so
# a given code means the same limit range in both.
train_limit_cut, limit_edges = pd.cut(
    train_data_X['LIMIT_BAL'], 4, retbins=True
)
# Open the outer edges so test limits below/above the training range land in
# the first/last bin instead of becoming NaN (code -1).
limit_edges[0] = float('-inf')
limit_edges[-1] = float('inf')
train_data_X['LIMIT_BAL_bins'] = train_limit_cut.cat.codes
test_data_X['LIMIT_BAL_bins'] = pd.cut(
    test_data_X['LIMIT_BAL'], limit_edges
).cat.codes
# Logistic regression: fit on the training set, evaluate on the test set.
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

# Build the model with the library defaults and fit it once.
clf = LogisticRegressionCV()
clf.fit(train_data_X, train_data_Y)

# Predict class probabilities for the test set a single time and reuse them
# for every metric; fitting the model again would only repeat the same work.
y_score = clf.predict_proba(test_data_X)
# Column 1 is taken as the score for class 1.
preds = y_score[:, 1]
print(roc_auc_score(test_data_Y, preds))

# Points of the ROC curve and the area under it, used for plotting.
fpr, tpr, thresholds = roc_curve(test_data_Y, preds)
roc_auc = auc(fpr, tpr)
def drawRoc(roc_auc,fpr,tpr):
    """Plot a ROC curve together with the diagonal reference line and show it.

    Args:
        roc_auc: Area under the curve; printed in the legend with two decimals.
        fpr: False positive rates, used as the x coordinates of the curve.
        tpr: True positive rates, used as the y coordinates of the curve.
    """
    _, axes = plt.subplots(figsize=(7, 5.5))

    curve_label = 'ROC curve (area = %0.2f)' % roc_auc
    axes.plot(fpr, tpr, color='darkorange', lw=2, label=curve_label)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    axes.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    axes.set_xlim([0.0, 1.0])
    # The y axis runs slightly above 1.0.
    axes.set_ylim([0.0, 1.05])
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('LogisticRegression ROC Curve')
    axes.legend(loc="lower right")

    plt.show()
# Show the ROC curve computed above.
drawRoc(roc_auc, fpr, tpr)

# Build the result table: one row per test sample with its ID, the predicted
# probability and the 0/1 label derived from it.
test_data_ID = test_data[['ID']]
test_proba = pd.DataFrame(preds, columns=['proba'])

# Label is 1.0 when the probability is above 0.5 and 0.0 otherwise. It is
# built as a new frame so that `test_proba` keeps the raw probabilities;
# thresholding through an alias would overwrite them in place.
test_target = (test_proba > 0.5).astype(float)
test_target.columns = ['target']

test_re = pd.concat([test_data_ID, test_proba, test_target], axis=1)
# NOTE(review): the file name mentions RandomForest although the predictions
# come from a logistic regression model - confirm the intended name before
# changing it, since downstream steps may read this exact file.
test_re.to_csv("测试test_RandomForest0904.csv", index=False, sep=',')