# -*- coding: utf-8 -*-
"""
Created on Sat Sep 19 19:28:29 2020
@author: 谢树鹏
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#%matplotlib inline
import seaborn as sns
import warnings
import matplotlib
# Global runtime configuration, followed by loading the training data.
warnings.filterwarnings('ignore')  # suppress every warning for the whole run
pd.set_option('display.max_columns', None)  # never truncate columns when printing

# Read the training CSV and expose column PAY_0 under the name PAY_1
# (presumably to line up with the PAY_2.. naming -- confirm against the data).
train_data = pd.read_csv(r'E:\文档\我的文档\大数据\题目和模板\Train.csv').rename(
    columns={'PAY_0': 'PAY_1'}
)

# Plot fonts: KaiTi for both font families (presumably so CJK labels render).
# The seaborn style is applied afterwards and sets sans-serif to KaiTi + Arial.
matplotlib.rcParams.update({
    'font.sans-serif': ['KaiTi'],
    'font.serif': ['KaiTi'],
})
sns.set_style("darkgrid", {"font.sans-serif": ['KaiTi', 'Arial']})
# Work around the minus sign '-' being drawn as a box in saved figures.
matplotlib.rcParams['axes.unicode_minus'] = False
# ---------------------------------------------------------------------------
# Feature engineering for the training and test sets.
#
# Every transform (bin edges, category codes, scaling statistics) is derived
# from the training data and then applied unchanged to the test data, so the
# two feature matrices are encoded identically and no test-set statistics
# leak into preprocessing.
#
# Produces: train_data_X, train_data_Y, test_data_X, test_data_Y
# (the X matrices are NumPy arrays returned by StandardScaler).
# ---------------------------------------------------------------------------
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Load the test set up front so each step below can handle both frames together.
test_data = pd.read_csv(r'E:\文档\我的文档\大数据\题目和模板\Test0904.csv')
# Mirror the PAY_0 -> PAY_1 rename applied to the training frame so both frames
# share column names (a no-op if the test file has no PAY_0 column).
test_data.rename(columns={'PAY_0': 'PAY_1'}, inplace=True)

# --- LIMIT_BAL (credit limit): bin, then encode each bin as an integer ------
# Edges are expressed in units of 10,000. Intervals are right-closed, so the
# first edge must be 0 for a limit of exactly 10,000 to land in (0, 5].
# The same edges are used for both frames.
bins = [0, 5, 20, 35, 50, 100]
train_data['LIMIT_BAL_bins'] = pd.cut(train_data['LIMIT_BAL'] / 10000, bins)
test_data['LIMIT_BAL_bins'] = pd.cut(test_data['LIMIT_BAL'] / 10000, bins)

# Mean of the target per credit-limit bin (kept for inspection).
by_bins = train_data.groupby('LIMIT_BAL_bins')['y'].mean()

# Categorical codes follow the bin order (0 = lowest bin) and are therefore
# identical across frames; values outside every bin get -1.
train_data['LIMIT_BAL_bins_id'] = train_data['LIMIT_BAL_bins'].cat.codes
test_data['LIMIT_BAL_bins_id'] = test_data['LIMIT_BAL_bins'].cat.codes

# --- Standardize AGE, BILL_AMT1..6 and PAY_AMT1..6 --------------------------
# StandardScaler works column by column, so one scaler covers all columns.
# It is fitted on the training frame only; the test frame is transformed with
# the training mean/std.
scaled_columns = (
    ['AGE']
    + ['BILL_AMT{}'.format(i) for i in range(1, 7)]
    + ['PAY_AMT{}'.format(i) for i in range(1, 7)]
)
scaler = preprocessing.StandardScaler()
train_data[scaled_columns] = scaler.fit_transform(train_data[scaled_columns])
test_data[scaled_columns] = scaler.transform(test_data[scaled_columns])

# --- Split features / target ------------------------------------------------
# NOTE(review): every remaining column becomes a feature; if the CSVs carry an
# identifier column (e.g. 'ID') it is still included here -- confirm whether
# it should be dropped before fitting.
train_data_X = train_data.drop(['y'], axis=1)
train_data_Y = train_data['y']
test_data_X = test_data.drop(['y'], axis=1)
test_data_Y = test_data['y']

# The 'LIMIT_BAL_bins' column holds Interval objects, which the estimator
# cannot consume. Replace it with the code of a 4-bin equal-width cut of
# LIMIT_BAL. The edges come from the training data; the outer edges are
# opened to +/-inf so test values outside the training range fall into the
# end bins instead of becoming -1.
width_edges = pd.cut(train_data_X['LIMIT_BAL'], 4, retbins=True)[1]
width_edges = np.concatenate(([-np.inf], width_edges[1:-1], [np.inf]))
train_data_X['LIMIT_BAL_bins'] = pd.cut(train_data_X['LIMIT_BAL'], width_edges).cat.codes
test_data_X['LIMIT_BAL_bins'] = pd.cut(test_data_X['LIMIT_BAL'], width_edges).cat.codes

# Standardize the full feature matrix (fit on train, apply to test) so every
# feature contributes on a comparable scale to the distance-based model.
sc = StandardScaler()
train_data_X = sc.fit_transform(train_data_X)
test_data_X = sc.transform(test_data_X)
# K-nearest-neighbours classifier: fit on the training matrix, then score the
# test matrix by ROC AUC.
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier

# Minkowski metric with p=2 and 5 neighbours; fit() returns the estimator itself.
clf = KNeighborsClassifier(
    metric="minkowski",
    p=2,
    n_neighbors=5,
).fit(train_data_X, train_data_Y)

# Column 1 of predict_proba is the probability of the second class in
# clf.classes_ (assumes a binary 0/1 target, so this is P(y == 1) -- TODO confirm).
preds = clf.predict_proba(test_data_X)[:, 1]
print(roc_auc_score(y_true=test_data_Y, y_score=preds))
# Everything from here to the end of the file is disabled: the triple-quoted
# blocks are bare string expressions and the '#' lines are comments, so none
# of it executes. Together they sketch how the predictions would be exported.

# Disabled step 1: pair each test 'ID' with its predicted probability in a
# frame named test_re (column 'proba').
'''
#输出结果
test_data_ID = test_data['ID']
test_data_ID = pd.DataFrame(test_data_ID)
test_proba=pd.DataFrame(preds)
test_proba.columns=['proba']
test_re = pd.concat([test_data_ID,test_proba],axis=1)
'''
# Disabled alternative: attach the true labels as a 'target' column instead.
#test_target = pd.DataFrame(test_data_Y)
#test_target.columns = ['target']
#test_re = pd.concat([test_data_ID,test_proba,test_target],axis=1)

# Disabled step 2: threshold the probabilities at 0.5 into a 0/1 'target'
# column, append it to test_re and write the result to CSV.
# NOTE(review): `test_target = test_proba` binds the same DataFrame (no copy),
# so the thresholding and the column rename would also modify test_proba.
# NOTE(review): the output file name mentions RandomForest -- confirm it is
# the intended name before re-enabling.
'''
test_target = test_proba
test_target[(test_target <= 0.5)] = 0
test_target[(test_target > 0.5)] = 1
test_target.columns=['target']
test_re = pd.concat([test_re,test_target],axis=1)
test_re.to_csv("test_RandomForest0904a.csv",index=False,sep=',')
'''