barclays_challenge_event_20.../model/model.py

70 lines
1.9 KiB
Python

import pandas as pd
import preprocessing as pp
from sklearn.ensemble import RandomForestClassifier
# Integer encodings for the categorical feature columns.
#
# Each ``<column>_values`` dict maps the raw string found in that column to
# the integer code the model is trained on.
# NOTE(review): a category missing from these tables has no code, and
# ``Series.map`` turns such values into NaN -- confirm the tables cover every
# category present in the dataset.
categorical_columns = [
    "person_home_ownership",
    "loan_intent",
    "loan_grade",
    "cb_person_default_on_file",
]

# Codes are assigned 1..N in the order the labels are listed.
person_home_ownership_values = {
    label: code
    for code, label in enumerate(
        ("RENT", "MORTGAGE", "OWN", "OTHER"), start=1
    )
}

loan_intent_values = {
    label: code
    for code, label in enumerate(
        ("EDUCATIONAL", "MEDICAL", "VENTURE", "PERSONAL", "DEBTCONSOLIDATION"),
        start=1,
    )
}

loan_grade_values = {
    label: code
    for code, label in enumerate(("A", "B", "C", "D", "E"), start=1)
}

# Binary flag: "Y" -> 1, "N" -> 0.
cb_person_default_on_file_values = dict(Y=1, N=0)
def get_default_predictor(csv_path="credit_risk_dataset.csv"):
    """Build the loan-default model from the credit risk CSV.

    Loads the dataset, encodes the categorical columns with
    ``convert_categories``, then passes the frame through the
    ``preprocessing`` helpers (imputation, train/test split on the
    ``loan_status`` target, normalisation) before constructing the model.

    Args:
        csv_path: Path of the CSV file to load. Defaults to
            ``"credit_risk_dataset.csv"``, i.e. a file in the current
            working directory.

    Returns:
        The model returned by ``random_forest_classifier`` for the
        normalised training features and the training target.
    """
    # pandas exposes no ``from_csv`` function (calling it raises
    # AttributeError); ``read_csv`` is the CSV loader.
    data = pd.read_csv(csv_path)
    data = convert_categories(data)

    # Imputation
    data = pp.impute_missing_values(data)

    # Training and testing preparation
    train_features, train_target, test_features, _test_target = (
        pp.training_test_split(data, "loan_status")
    )

    # Normalise the data. The test features are still passed in because
    # ``pp.normalise`` takes both sets, but the held-out split is not used
    # any further in this function.
    training_features, _normalised_test_features = pp.normalise(
        train_features, test_features
    )

    # Init models
    return random_forest_classifier(training_features, train_target)
def random_forest_classifier(training_features, training_target):
    """Create a random forest classifier and fit it to the training data.

    Args:
        training_features: Feature matrix to train on.
        training_target: Target labels aligned with ``training_features``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    model = RandomForestClassifier(
        max_features="log2",
        random_state=79,  # fixed seed so repeated runs build the same forest
        n_jobs=-1,
    )
    # Previously the training data was accepted but never used, so callers
    # received an unfitted estimator that cannot predict. Train it here.
    model.fit(training_features, training_target)
    return model
def convert_categories(data):
    """Replace the categorical string columns with their integer codes.

    Each categorical column is passed through ``Series.map`` with its
    module-level lookup table and assigned back onto ``data``, so the input
    object is modified in place as well as returned. Values that have no
    entry in the lookup table come out as NaN.

    Args:
        data: Table containing the four categorical columns
            (presumably a pandas DataFrame -- confirm against callers).

    Returns:
        The same ``data`` object with the four columns re-encoded.
    """
    encodings = {
        "person_home_ownership": person_home_ownership_values,
        "loan_intent": loan_intent_values,
        "loan_grade": loan_grade_values,
        "cb_person_default_on_file": cb_person_default_on_file_values,
    }
    for column, codes in encodings.items():
        data[column] = data[column].map(codes)
    return data