贷款分析

1
2
3
4
5
6
import pandas as pd
loans_2007 = pd.read_csv('LoanStats3a.csv', skiprows=1)
half_count = len(loans_2007) / 2
loans_2007 = loans_2007.dropna(thresh=half_count, axis=1)
loans_2007 = loans_2007.drop(['desc', 'url'],axis=1)
loans_2007.to_csv('loans_2007.csv', index=False)
D:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (0,47) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
1
2
3
4
5
import pandas as pd
loans_2007 = pd.read_csv("loans_2007.csv")
#loans_2007.drop_duplicates()
print(loans_2007.iloc[0].head(15))
print(loans_2007.shape[1]) #共52个特征
id                        1077501
member_id              1.2966e+06
loan_amnt                    5000
funded_amnt                  5000
funded_amnt_inv              4975
term                    36 months
int_rate                   10.65%
installment                162.87
grade                           B
sub_grade                      B2
emp_title                     NaN
emp_length              10+ years
home_ownership               RENT
annual_inc                  24000
verification_status      Verified
Name: 0, dtype: object
52


D:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

预处理——数据清洗

先直观上 去掉不需要的特征。
包括: 1.预测后才出现的特征(比如:实际发放的贷款)、2.高度相关的特征(比如:123和ABC)、3。关系不大的特征(比如:ID等)

1
2
loans_2007 = loans_2007.drop(["id", "member_id", "funded_amnt", "funded_amnt_inv", "grade", "sub_grade", "emp_title", "issue_d"], axis=1)
loans_2007 = loans_2007.drop(["zip_code", "out_prncp", "out_prncp_inv", "total_pymnt", "total_pymnt_inv", "total_rec_prncp"], axis=1)
1
2
3
loans_2007 = loans_2007.drop(["total_rec_int", "total_rec_late_fee", "recoveries", "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt"], axis=1)
print(loans_2007.iloc[0])
print(loans_2007.shape[1])
loan_amnt                            5000
term                            36 months
int_rate                           10.65%
installment                        162.87
emp_length                      10+ years
home_ownership                       RENT
annual_inc                          24000
verification_status              Verified
loan_status                    Fully Paid
pymnt_plan                              n
purpose                       credit_card
title                            Computer
addr_state                             AZ
dti                                 27.65
delinq_2yrs                             0
earliest_cr_line                 Jan-1985
inq_last_6mths                          1
open_acc                                3
pub_rec                                 0
revol_bal                           13648
revol_util                          83.7%
total_acc                               9
initial_list_status                     f
last_credit_pull_d               Nov-2016
collections_12_mths_ex_med              0
policy_code                             1
application_type               INDIVIDUAL
acc_now_delinq                          0
chargeoff_within_12_mths                0
delinq_amnt                             0
pub_rec_bankruptcies                    0
tax_liens                               0
Name: 0, dtype: object
32

预处理——label列属性数字替换

对label属性进行统计,选择适合用于分类的属性

1
print(loans_2007['loan_status'].value_counts())
Fully Paid                                             33902
Charged Off                                             5658
Does not meet the credit policy. Status:Fully Paid      1988
Does not meet the credit policy. Status:Charged Off      761
Current                                                  201
Late (31-120 days)                                        10
In Grace Period                                            9
Late (16-30 days)                                          5
Default                                                    1
Name: loan_status, dtype: int64
1
2
3
4
5
6
7
8
9
10
11
12
13
# 属性为Fully Paid 和 Charged off 的替换为 1 和 0
# 只取这部分数据
loans_2007 = loans_2007[(loans_2007['loan_status'] == "Fully Paid") | (loans_2007['loan_status'] == "Charged Off")]

# 替换指定列的 指定属性为 指定的值★
status_replace = {
"loan_status" : {
"Fully Paid": 1,
"Charged Off": 0,
}
}

loans_2007 = loans_2007.replace(status_replace)

预处理——最后再去掉列属性只有一个的列

1
2
3
4
5
6
7
8
9
10
11
12
#let's look for any columns that contain only one unique value and remove them

orig_columns = loans_2007.columns
drop_columns = []
for col in orig_columns:
col_series = loans_2007[col].dropna().unique() #这里要去掉空值nan以后 再判定列的属性是否只有一个
if len(col_series) == 1:
drop_columns.append(col)
loans_2007 = loans_2007.drop(drop_columns, axis=1)
print(drop_columns)
print(loans_2007.shape)
loans_2007.to_csv('filtered_loans_2007.csv', index=False)
['initial_list_status', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt', 'tax_liens']
(39560, 24)

处理完毕,最终得到24列数据

预处理——缺失值

先查找列缺失值多的,去掉这些列,只剩下缺失值少的列。之后直接去掉那些样本行即可

1
2
3
4
import pandas as pd
loans = pd.read_csv('filtered_loans_2007.csv')
null_counts = loans.isnull().sum()
print(null_counts)
loan_amnt                 0
term                      0
int_rate                  0
installment               0
emp_length                0
home_ownership            0
annual_inc                0
verification_status       0
loan_status               0
pymnt_plan                0
purpose                   0
title                    10
addr_state                0
dti                       0
delinq_2yrs               0
earliest_cr_line          0
inq_last_6mths            0
open_acc                  0
pub_rec                   0
revol_bal                 0
revol_util               50
total_acc                 0
last_credit_pull_d        2
pub_rec_bankruptcies    697
dtype: int64
1
2
loans = loans.drop("pub_rec_bankruptcies", axis=1)
loans = loans.dropna(axis=0)
object     12
float64    10
int64       1
dtype: int64
1
2
# 处理完缺失值,统计每种数据类型的列 有几个
print(loans.dtypes.value_counts())

预处理——字符串值转换

1
2
object_columns_df = loans.select_dtypes(include=["object"])
print(object_columns_df.iloc[0])
term                     36 months
int_rate                    10.65%
emp_length               10+ years
home_ownership                RENT
verification_status       Verified
pymnt_plan                       n
purpose                credit_card
title                     Computer
addr_state                      AZ
earliest_cr_line          Jan-1985
revol_util                   83.7%
last_credit_pull_d        Nov-2016
Name: 0, dtype: object
1
2
3
cols = ['home_ownership', 'verification_status', 'emp_length', 'term', 'addr_state']
for c in cols:
print(loans[c].value_counts())
RENT        18780
MORTGAGE    17574
OWN          3045
OTHER          96
NONE            3
Name: home_ownership, dtype: int64
Not Verified       16856
Verified           12705
Source Verified     9937
Name: verification_status, dtype: int64
10+ years    8821
< 1 year     4563
2 years      4371
3 years      4074
4 years      3409
5 years      3270
1 year       3227
6 years      2212
7 years      1756
8 years      1472
9 years      1254
n/a          1069
Name: emp_length, dtype: int64
 36 months    29041
 60 months    10457
Name: term, dtype: int64
CA    7070
NY    3788
FL    2856
TX    2714
NJ    1838
IL    1517
PA    1504
VA    1400
GA    1393
MA    1336
OH    1208
MD    1049
AZ     874
WA     834
CO     786
NC     780
CT     747
MI     722
MO     682
MN     611
NV     492
SC     470
WI     453
AL     446
OR     445
LA     435
KY     325
OK     298
KS     269
UT     256
AR     243
DC     211
RI     198
NM     188
WV     176
HI     172
NH     172
DE     113
MT      84
WY      83
AK      79
SD      63
VT      54
MS      19
TN      17
IN       9
ID       6
IA       5
NE       5
ME       3
Name: addr_state, dtype: int64
1
2
3
# 这2个特征内容差不多,选择去掉title列
print(loans["purpose"].value_counts())
print(loans["title"].value_counts())
debt_consolidation    18533
credit_card            5099
other                  3963
home_improvement       2965
major_purchase         2181
small_business         1815
car                    1544
wedding                 945
medical                 692
moving                  581
vacation                379
house                   378
educational             320
renewable_energy        103
Name: purpose, dtype: int64
Debt Consolidation                         2168
Debt Consolidation Loan                    1706
Personal Loan                               658
Consolidation                               509
debt consolidation                          502
Credit Card Consolidation                   356
Home Improvement                            354
Debt consolidation                          333
Small Business Loan                         322
Credit Card Loan                            313
Personal                                    308
Consolidation Loan                          255
Home Improvement Loan                       246
personal loan                               234
personal                                    220
Loan                                        212
Wedding Loan                                209
consolidation                               200
Car Loan                                    200
Other Loan                                  190
Credit Card Payoff                          155
Wedding                                     152
Major Purchase Loan                         144
Credit Card Refinance                       143
Consolidate                                 127
Medical                                     122
Credit Card                                 117
home improvement                            111
My Loan                                      94
Credit Cards                                 93
                                           ... 
DebtConsolidationn                            1
 Freedom                                      1
Credit Card Consolidation Loan - SEG          1
SOLAR PV                                      1
Pay on Credit card                            1
To pay off balloon payments due               1
Paying off the debt                           1
Payoff ING PLOC                               1
Josh CC Loan                                  1
House payoff                                  1
Taking care of Business                       1
Gluten Free Bakery in ideal town for it       1
Startup Money for Small Business              1
FundToFinanceCar                              1
getting ready for Baby                        1
Dougs Wedding Loan                            1
d rock                                        1
LC Loan 2                                     1
swimming pool repair                          1
engagement                                    1
Cut the credit cards Loan                     1
vinman                                        1
working hard to get out of debt               1
consolidate the rest of my debt               1
Medical/Vacation                              1
2BDebtFree                                    1
Paying Off High Interest Credit Cards!        1
Baby on the way!                              1
cart loan                                     1
Consolidaton                                  1
Name: title, dtype: int64
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
mapping_dict = {
"emp_length": {
"10+ years": 10,
"9 years": 9,
"8 years": 8,
"7 years": 7,
"6 years": 6,
"5 years": 5,
"4 years": 4,
"3 years": 3,
"2 years": 2,
"1 year": 1,
"< 1 year": 0,
"n/a": 0
}
}
loans = loans.drop(["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"], axis=1)
loans["int_rate"] = loans["int_rate"].str.rstrip("%").astype("float")
loans["revol_util"] = loans["revol_util"].str.rstrip("%").astype("float")
loans = loans.replace(mapping_dict)
1
2
3
4
5
6

cat_columns = ["home_ownership", "verification_status", "emp_length", "purpose", "term"]
dummy_df = pd.get_dummies(loans[cat_columns])
loans = pd.concat([loans, dummy_df], axis=1)
loans = loans.drop(cat_columns, axis=1)
loans = loans.drop("pymnt_plan", axis=1)
1
loans.to_csv('cleaned_loans2007.csv', index=False)

模型训练、评估、调参——训练目标是盈利最大化

模型训练目标:

是确保TP的数量至少是FP的10倍。
然而这里不适合使用精度,故我们选择 TPR 和 FPR ,要使 TPR尽可能大, FPR 尽可能小

1
2
3
import pandas as pd
loans = pd.read_csv("cleaned_loans2007.csv")
print(loans.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39498 entries, 0 to 39497
Data columns (total 37 columns):
loan_amnt                              39498 non-null float64
int_rate                               39498 non-null float64
installment                            39498 non-null float64
annual_inc                             39498 non-null float64
loan_status                            39498 non-null int64
dti                                    39498 non-null float64
delinq_2yrs                            39498 non-null float64
inq_last_6mths                         39498 non-null float64
open_acc                               39498 non-null float64
pub_rec                                39498 non-null float64
revol_bal                              39498 non-null float64
revol_util                             39498 non-null float64
total_acc                              39498 non-null float64
home_ownership_MORTGAGE                39498 non-null int64
home_ownership_NONE                    39498 non-null int64
home_ownership_OTHER                   39498 non-null int64
home_ownership_OWN                     39498 non-null int64
home_ownership_RENT                    39498 non-null int64
verification_status_Not Verified       39498 non-null int64
verification_status_Source Verified    39498 non-null int64
verification_status_Verified           39498 non-null int64
purpose_car                            39498 non-null int64
purpose_credit_card                    39498 non-null int64
purpose_debt_consolidation             39498 non-null int64
purpose_educational                    39498 non-null int64
purpose_home_improvement               39498 non-null int64
purpose_house                          39498 non-null int64
purpose_major_purchase                 39498 non-null int64
purpose_medical                        39498 non-null int64
purpose_moving                         39498 non-null int64
purpose_other                          39498 non-null int64
purpose_renewable_energy               39498 non-null int64
purpose_small_business                 39498 non-null int64
purpose_vacation                       39498 non-null int64
purpose_wedding                        39498 non-null int64
term_ 36 months                        39498 non-null int64
term_ 60 months                        39498 non-null int64
dtypes: float64(12), int64(25)
memory usage: 11.1 MB
None
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import pandas as pd
# False positives.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])
1
2
3
4
5
6
7
8
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
cols = loans.columns
train_cols = cols.drop("loan_status")
features = loans[train_cols]
target = loans["loan_status"]
lr.fit(features, target)
predictions = lr.predict(features)

第一次,逻辑回归

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_predict, KFold
lr = LogisticRegression()
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

# False positives.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / float((tp + fn))
fpr = fp / float((fp + tn))

print(tpr)
print(fpr)
print predictions[:20]
0.999084438406
0.998049299521
0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
dtype: int64

因为样本不均衡导致效果不好。

第二次,逻辑回归,设置模型参数 class_weight=”balanced” ,让模型自动添加惩罚权重

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_predict
lr = LogisticRegression(class_weight="balanced")
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

# False positives.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / float((tp + fn))
fpr = fp / float((fp + tn))

print(tpr)
print(fpr)
print predictions[:20]
0.670781771464
0.400780280192
0     1
1     0
2     0
3     1
4     1
5     0
6     0
7     0
8     0
9     0
10    1
11    0
12    1
13    1
14    0
15    0
16    1
17    1
18    1
19    0
dtype: int64

模型终于起了效果,但还是不理想

第三次,逻辑回归,手动调节 cclass_weight 参数设置惩罚系数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_predict
penalty = {
0: 5,
1: 1
}

lr = LogisticRegression(class_weight=penalty)
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

# False positives.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / float((tp + fn))
fpr = fp / float((fp + tn))

print(tpr)
print(fpr)
0.731799521545
0.478985635751

效果又好了一些

第四次,随机森林

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_predict
rf = RandomForestClassifier(n_estimators=10,class_weight="balanced", random_state=1)
#print help(RandomForestClassifier)
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(rf, features, target, cv=kf)
predictions = pd.Series(predictions)

# False positives.
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])

# True positives.
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])

# False negatives.
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])

# True negatives
tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
tn = len(predictions[tn_filter])

# Rates
tpr = tp / float((tp + fn))
fpr = fp / float((fp + tn))
꧁༺The༒End༻꧂