barclays_challenge_event_20.../missing_values.py

41 lines
1.1 KiB
Python

import warnings

import pandas as pd
from sklearn.impute import KNNImputer
# Load the raw records; the path is resolved against the working directory.
dataframe = pd.read_csv("credit_risk_dataset.csv")

# Feature Conversion Values
# Each table assigns an integer code to a category label so that the column
# becomes numeric and can be passed to the KNN imputer below.
# NOTE(review): confirm these tables list every label that occurs in the CSV.
# A label that is absent here is converted to NaN by Series.map and is then
# overwritten by the imputer as if it had been missing in the first place.
person_home_ownership_values = {
    "RENT": 1,
    "MORTGAGE": 2,
    "OWN": 3,
    "OTHER": 4,
}
loan_intent_values = {
    "EDUCATIONAL": 1,
    "MEDICAL": 2,
    "VENTURE": 3,
    "PERSONAL": 4,
    "DEBTCONSOLIDATION": 5,
}
loan_grade_values = {
    "A": 1,
    "B": 2,
    "C": 3,
    "D": 4,
    "E": 5,
}
cb_person_default_on_file_values = {
    "Y": 1,
    "N": 0,
}

# Column name -> conversion table, in the order the columns are encoded.
_CATEGORICAL_ENCODINGS = {
    "person_home_ownership": person_home_ownership_values,
    "loan_intent": loan_intent_values,
    "loan_grade": loan_grade_values,
    "cb_person_default_on_file": cb_person_default_on_file_values,
}


def _encode_column(values, codes, column_name):
    """Return *values* mapped through *codes*, warning about unmapped labels.

    ``Series.map`` yields NaN for every label that is not a key of *codes*.
    Such NaNs cannot be told apart from genuinely missing entries and would be
    filled in by the imputer, so they are reported instead of being lost
    silently. A warning (not an exception) is used so that a run which
    previously completed still completes.

    Args:
        values: The categorical column to encode.
        codes: Mapping from category label to numeric code.
        column_name: Name used to identify the column in the warning text.

    Returns:
        The encoded column; unmapped and originally missing entries are NaN.
    """
    encoded = values.map(codes)
    # Present in the input but NaN after mapping => the label has no code.
    unmapped = values[values.notna() & encoded.isna()]
    if not unmapped.empty:
        labels = sorted(unmapped.astype(str).unique())
        warnings.warn(
            f"{column_name}: {len(unmapped)} value(s) have no numeric code "
            f"and will be treated as missing: {labels}",
            stacklevel=2,
        )
    return encoded


for _column, _codes in _CATEGORICAL_ENCODINGS.items():
    dataframe[_column] = _encode_column(dataframe[_column], _codes, _column)

# Fill every remaining NaN from the 9 nearest rows (unweighted), measuring
# distance with a Euclidean metric that tolerates missing coordinates.
# NOTE(review): the fill value is an average over neighbours, so imputed
# entries in the encoded categorical columns may not be whole-number codes.
imputer = KNNImputer(n_neighbors=9, weights="uniform", metric="nan_euclidean")
imputed_data = imputer.fit_transform(dataframe)

# Re-attach the column labels to the imputer output before writing it out;
# the row index is deliberately not written.
pd.DataFrame(imputed_data, columns=dataframe.columns).to_csv(
    "imputed_data.csv", index=False
)