datawhale-Task2-iv值特征处理
数据集下载:https://pan.baidu.com/s/1wO9qJRjnrm8uhaSP67K0lw
# -*- coding: utf-8 -*-
"""
@author: Administrator
"""
import pandas as pd
import numpy as np
def calc_iv(df, feature, target, pr=False):
    """Compute the Information Value (IV) of one feature against a 0/1 target.

    Missing values of ``feature`` are treated as a category of their own,
    labelled ``'NULL'``. Every distinct value of the feature becomes one row
    of the returned table.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is left untouched (the NaN replacement is applied to
        a private copy of the column, not written back).
    feature : column label
        Column whose IV is computed.
    target : column label
        Label column; rows equal to ``0`` are counted as good and rows equal
        to ``1`` as bad.
    pr : bool, default False
        Set pr=True to enable printing of the table and of the total IV.

    Returns
    -------
    iv : float
        Sum of the per-value IV contributions.
    data : pandas.DataFrame
        One row per distinct feature value, sorted by ``Value``, with the
        columns ``Variable``, ``Value``, ``All``, ``Good``, ``Bad``,
        ``Share``, ``Bad Rate``, ``Distribution Good``, ``Distribution Bad``,
        ``WoE`` and ``IV``.
    """
    # Fill NaNs on a derived Series so the caller's DataFrame keeps its NaNs.
    values = df[feature].fillna('NULL')
    labels = df[target]

    # One indicator column per count, then a single group-by pass instead of
    # re-filtering the whole frame three times for every distinct value.
    flags = pd.DataFrame({
        'All': np.ones(len(values), dtype='int64'),
        'Good': (labels == 0).to_numpy().astype('int64'),
        'Bad': (labels == 1).to_numpy().astype('int64'),
    })
    # Group on the raw array (not the Series) to avoid any index alignment.
    counts = flags.groupby(values.to_numpy(), sort=False).sum()

    data = pd.DataFrame(
        {
            'Variable': [feature] * len(counts),
            'Value': counts.index.tolist(),
            'All': counts['All'].to_numpy(),
            'Good': counts['Good'].to_numpy(),
            'Bad': counts['Bad'].to_numpy(),
        },
        columns=['Variable', 'Value', 'All', 'Good', 'Bad'],
    )

    data['Share'] = data['All'] / data['All'].sum()
    data['Bad Rate'] = data['Bad'] / data['All']
    # "Good" is deliberately derived as All - Bad here (not from the 'Good'
    # column); the two only differ when the target holds values other than
    # 0 and 1. Kept so that results stay identical to earlier runs.
    data['Distribution Good'] = (data['All'] - data['Bad']) / (data['All'].sum() - data['Bad'].sum())
    data['Distribution Bad'] = data['Bad'] / data['Bad'].sum()

    # log(0) yields -inf and is zeroed just below, so the divide-by-zero
    # RuntimeWarning numpy would emit carries no information.
    with np.errstate(divide='ignore'):
        data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])
    # A value with no good rows or no bad rows contributes nothing to the IV.
    data = data.replace({'WoE': {np.inf: 0, -np.inf: 0}})
    data['IV'] = data['WoE'] * (data['Distribution Good'] - data['Distribution Bad'])

    data = data.sort_values(by=['Variable', 'Value'], ascending=[True, True])
    data = data.reset_index(drop=True)

    iv = data['IV'].sum()
    if pr:
        print(data)
        print("IV = ", data['IV'].sum())
    return iv, data
# Load the GBK-encoded dataset from its local path.
df = pd.read_csv(
    r'E:\AIprojectspace\datawhale_dataanly\data\data.csv',
    encoding='gbk',
)

# Collect every column name, then drop the one at position 11 so that it is
# skipped by the IV loop below.
column_headers = [name for name in df.columns.values]
column_headers.pop(11)

# Print the IV of each remaining column against the 'status' label.
for x in column_headers:
    IV_1, data = calc_iv(df, x, target='status')
    print('{}: {}'.format(x, IV_1))

# Example of a single call on another dataset:
#     calc_iv(df, 'NET_TM', 'overdue')
E:\devtool\anconda\python.exe E:/AIprojectspace/datawhale_dataanly/iv_util.py
E:/AIprojectspace/datawhale_dataanly/iv_util.py:37: RuntimeWarning: divide by zero encountered in log
data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])
Unnamed: 0: 0.0
custid: 0.0
trade_no: 0.0
bank_card_no: 0.0
low_volume_percent: 0.03434912946670754
middle_volume_percent: 0.07118473287723781
take_amount_in_later_12_month_highest: 0.08329868129431747
trans_amount_increase_rate_lately: 0.2883857251682308
trans_activity_month: 0.0728243237636469
trans_activity_day: 0.36466249883484175
transd_mcc: 0.03615978377155986
trans_days_interval: 0.09452853184236024
regional_mobility: 0.007908925668911125
student_feature: 0.0009555051797128911
repayment_capability: 0.4390959671989395
is_high_user: 0.005613886645064013
number_of_trans_from_2011: 0.057930909034620885
first_transaction_time: 0.4861222320898152
historical_trans_amount: 0.053932681011240866
historical_trans_day: 0.3525315105292556
rank_trad_1_month: 0.12800012489827373
trans_amount_3_month: 0.24484789224813222
avg_consume_less_12_valid_month: 0.01366176736863671
abs: 0.3727207019325916
top_trans_count_last_1_month: 0.07238437681409188
avg_price_last_12_month: 0.19973803729124645
avg_price_top_last_12_valid_month: 0.0448799551161269
reg_preference_for_trad: 0.006067507102377457
trans_top_time_last_1_month: 0.10181052061837485
trans_top_time_last_6_month: 0.06783759655355802
consume_top_time_last_1_month: 0.09524841540395028
consume_top_time_last_6_month: 0.07432209918398235
cross_consume_count_last_1_month: 0.013730867661866968
trans_fail_top_count_enum_last_1_month: 0.5945548896319728
trans_fail_top_count_enum_last_6_month: 0.2783577754538106
trans_fail_top_count_enum_last_12_month: 0.2649286825335892
consume_mini_time_last_1_month: 0.0899281620326396
max_cumulative_consume_later_1_month: 0.3956491245615908
max_consume_count_later_6_month: 0.03062688466670764
railway_consume_count_last_12_month: 0.006307128262682564
pawns_auctions_trusts_consume_last_1_month: 0.19699501522871354
pawns_auctions_trusts_consume_last_6_month: 0.3389607214125947
jewelry_consume_count_last_6_month: 0.0019510185472895417
status: 0.0
source: 0.0
first_transaction_day: 0.4861222320898152
trans_day_last_12_month: 0.1693760273268455
id_name: 0.04357457727654854
apply_score: 0.4918946855152253
apply_credibility: 0.0407016198526198
query_org_count: 0.028526967528461396
query_finance_count: 0.01883760825079841
query_cash_count: 0.03182175138557366
query_sum_count: 0.0564050770802192
latest_query_time: 0.194805226642488
latest_one_month_apply: 0.051393502672567204
latest_three_month_apply: 0.049732356675958926
latest_six_month_apply: 0.039731457198607247
loans_score: 0.6214925431726865
loans_credibility_behavior: 0.015147189770975388
loans_count: 0.13654747372185833
loans_settle_count: 0.07975696874387915
loans_overdue_count: 0.3923359913425925
loans_org_count_behavior: 0.03235501235370871
consfin_org_count_behavior: 0.024380821724102025
loans_cash_count: 0.02974705927366485
latest_one_month_loan: 0.016360656963708916
latest_three_month_loan: 0.04463582457502856
latest_six_month_loan: 0.07556114729031406
history_suc_fee: 0.1295224405045246
history_fail_fee: 0.5133547604785954
latest_one_month_suc: 0.1505871178724152
latest_one_month_fail: 0.25593109317817125
loans_long_time: 0.10917949026234713
loans_latest_time: 0.18055372690968458
loans_credit_limit: 0.036173068257932414
loans_credibility_limit: 0.023876694793468937
loans_org_count_current: 0.02974705927366485
loans_product_count: 0.030767681943107732
loans_max_limit: 0.0810045117961755
loans_avg_limit: 0.3522987684188826
consfin_credit_limit: 0.2147374492843147
consfin_credibility: 0.028108692856604525
consfin_org_count_current: 0.024380821724102025
consfin_product_count: 0.02591024563815903
consfin_max_limit: 0.11093025966669148
consfin_avg_limit: 0.2751209316677875
latest_query_day: 0.14175486373224241
loans_latest_day: 0.1771739565352234
Process finished with exit code 0
参考:https://blog.****.net/weixin_35688006/article/details/88425262