ml model
This commit is contained in:
commit
2371f14c3c
|
|
@ -0,0 +1,5 @@
|
|||
__pycache__
|
||||
build
|
||||
barclays_credit_classifier.egg-info
|
||||
.env
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,77 @@
|
|||
import models
|
||||
import pandas as pd
|
||||
import preprocessing as pp
|
||||
from sklearn.metrics import accuracy_score
|
||||
|
||||
# Load Data
credit_risk = pd.read_csv("credit_risk_dataset.csv")

# Feature Addition
# (no engineered features are added at present)

# Feature Conversion
# Integer codes for every categorical column. Series.map() turns any category
# that is absent from its dict into NaN; those NaNs are then filled by
# pp.impute_missing_values() further down.
# NOTE(review): confirm each dict covers every category present in the CSV.
person_home_ownership_values = {
    "RENT": 1,
    "MORTGAGE": 2,
    "OWN": 3,
    "OTHER": 4,
}
loan_intent_values = {
    "EDUCATIONAL": 1,
    "MEDICAL": 2,
    "VENTURE": 3,
    "PERSONAL": 4,
    "DEBTCONSOLIDATION": 5,
}
loan_grade_values = {
    "A": 1,
    "B": 2,
    "C": 3,
    "D": 4,
    "E": 5,
}
cb_person_default_on_file_values = {
    "Y": 1,
    "N": 0,
}

# Convert each categorical column to a numerical column
categorical_encodings = {
    "person_home_ownership": person_home_ownership_values,
    "loan_intent": loan_intent_values,
    "loan_grade": loan_grade_values,
    "cb_person_default_on_file": cb_person_default_on_file_values,
}
for column_name, encoding in categorical_encodings.items():
    credit_risk[column_name] = credit_risk[column_name].map(encoding)

print("Feature Conversion Complete")

# Feature Removal
# (no columns are dropped at present)

# Preprocessing
# Missing values are filled on the whole frame, i.e. before the train/test
# split and while the "loan_status" target column is still part of it.
credit_risk = pp.impute_missing_values(credit_risk)
print("Missing Values handling Complete")
# Outlier removal (pp.remove_outliers) is not applied at present.

# Training and Testing Preparation
# Split the data into Training and Test sets, separating out the target.
(
    training_features,
    training_target_value,
    test_features,
    test_target_value,
) = pp.training_test_split(credit_risk, "loan_status")
print("Training and Test features split Complete")

# Normalise the data
training_features, test_features = pp.normalise(training_features, test_features)
print("Normalisation Complete")

# Init Models
rf_model = models.random_forest_classifier(training_features, training_target_value)
print("Model Init Complete")

# Get Predictions
rf_predictions = rf_model.predict(test_features)
print("Predictions Complete")

# Compare Results
accuracy = accuracy_score(test_target_value, rf_predictions)
print(f"Accuracy: {accuracy}")
print(rf_predictions)
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
from sklearn.ensemble import RandomForestClassifier
|
||||
|
||||
|
||||
def random_forest_classifier(training_features, training_target):
    """Build a random forest classifier and fit it to the training data.

    The forest is configured with max_features="log2", a fixed
    random_state of 79 and n_jobs=-1.

    Args:
        training_features: Feature matrix passed to ``fit``.
        training_target: Target values passed to ``fit``.

    Returns:
        The fitted RandomForestClassifier.
    """
    forest_settings = {
        "max_features": "log2",
        "random_state": 79,
        "n_jobs": -1,
    }
    classifier = RandomForestClassifier(**forest_settings)
    classifier.fit(training_features, training_target)
    return classifier
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
import pandas as pd
|
||||
from scipy import stats
|
||||
from sklearn.impute import KNNImputer
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
|
||||
|
||||
def training_test_split(data_frame, target_variable):
    """Split a frame into training/test features and targets.

    Rows are shuffled with a fixed seed (79); 75% go to the training set
    and the remainder to the test set.

    Args:
        data_frame: Frame holding both the features and the target column.
        target_variable: Name of the target column.

    Returns:
        A 4-tuple ``(training_features, training_target, test_features,
        test_target)``; the feature frames no longer contain the target
        column.
    """
    train_rows, test_rows = train_test_split(
        data_frame,
        train_size=0.75,
        random_state=79,
        shuffle=True,
    )
    return (
        train_rows.drop(target_variable, axis=1),
        train_rows[target_variable],
        test_rows.drop(target_variable, axis=1),
        test_rows[target_variable],
    )
|
||||
|
||||
|
||||
def normalise(training_features, test_features):
    """Min-max scale the training and test features.

    The scaler is fitted on the training features only and then applied
    to both sets.

    Args:
        training_features: Features used to fit the scaler.
        test_features: Features transformed with the already-fitted scaler.

    Returns:
        A pair ``(scaled_training, scaled_test)`` as returned by
        ``MinMaxScaler.transform``.
    """
    scaler = MinMaxScaler()
    scaler.fit(training_features)
    scaled_training = scaler.transform(training_features)
    scaled_test = scaler.transform(test_features)
    return scaled_training, scaled_test
|
||||
|
||||
|
||||
def impute_missing_values(data_frame):
    """Fill missing values with a 9-nearest-neighbour imputer.

    Uses ``KNNImputer`` with uniform weights and the ``nan_euclidean``
    metric, fitted on and applied to the whole frame.

    Args:
        data_frame: Frame to impute.
            NOTE(review): presumably all columns must already be numeric
            for the imputer to accept them; confirm against callers.

    Returns:
        A new DataFrame with the same column labels and the same index as
        ``data_frame``, holding the imputed values.
    """
    imputer = KNNImputer(n_neighbors=9, weights="uniform",
                         metric="nan_euclidean")
    imputed_data = imputer.fit_transform(data_frame)
    # Keep the caller's index so the imputed rows can still be aligned with
    # the original frame (the previous version reset it to 0..n-1).
    # NOTE(review): per the scikit-learn docs a column that is entirely NaN
    # is dropped by the imputer by default, which would make the column
    # labels below mismatch; confirm whether that case can occur.
    return pd.DataFrame(imputed_data, columns=data_frame.columns,
                        index=data_frame.index)
|
||||
|
||||
|
||||
def remove_outliers(data_frame, threshold=1.3412):
    """Drop rows whose z-score magnitude reaches ``threshold`` in any column.

    Columns are processed one after another: the z-scores of each column
    are computed on the rows that survived the previous columns.

    Args:
        data_frame: Frame to filter; it is not modified in place.
        threshold: Rows with ``abs(z) >= threshold`` in a column are
            removed. Defaults to the previously hard-coded 1.3412.

    Returns:
        The filtered DataFrame (original index labels are retained).
    """
    for column in data_frame.columns:
        z_scores = stats.zscore(data_frame[column])
        # A constant column (zero spread) or one containing NaN produces
        # NaN z-scores. Written as "not an outlier" rather than "inside the
        # band", NaN rows are kept instead of wiping out the whole frame.
        keep = ~(abs(z_scores) >= threshold)
        data_frame = data_frame[keep]
    return data_frame
|
||||
|
|
@ -0,0 +1,18 @@
|
|||
from setuptools import setup, find_packages
|
||||
|
||||
# Runtime dependencies declared for installation.
REQUIREMENTS = [
    "numpy",
    "scipy",
    "matplotlib",
    "pandas",
    "scikit-learn",
    "seaborn",
]

# NOTE(review): find_packages() only discovers directories that contain an
# __init__.py; confirm the project's modules are laid out as a package.
setup(
    name="barclays_credit_classifier",
    version="1.0.0",
    description="Predicts whether someone will default on their loan. Uses the Credit Risk Dataset from Kaggle",
    author="r0r-5chach",
    author_email="r0r-5chach.xyz@proton.me",
    packages=find_packages(),
    install_requires=REQUIREMENTS,
)
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
|
||||
def missing_value_pairwise_plots(data_frame, null_column, save=False):
    """Plot every other column against ``null_column``, marking missing rows.

    For each column other than ``null_column`` one figure is produced: a
    scatter of the fully populated rows, plus a vertical line at the
    column's value for every row whose ``null_column`` entry is missing.

    Args:
        data_frame: Frame containing ``null_column`` and the other columns.
        null_column: Name of the column whose missing values are studied.
        save: If true, each figure is written to
            ``missing_values[<column>:<null_column>].png``; otherwise it is
            shown interactively.
    """
    not_missing_data = data_frame.dropna()
    missing_data = data_frame[data_frame[null_column].isnull()]
    for column in data_frame.columns:
        if column == null_column:
            continue
        plt.figure()
        plt.title(f"Scatter Plot of {column} against {null_column}")
        # Pass the column's *values* for the rows with a missing
        # null_column entry. The previous code passed the column name, so
        # the helper iterated over the characters of that string.
        plot_missing_values(missing_data[column].dropna())
        plt.scatter(not_missing_data[column], not_missing_data[null_column],
                    color=[[0.502, 0, 0.502, 0.4]], label="Existing Values")
        plt.xlabel(column)
        plt.ylabel(null_column)
        plt.legend()
        if save:
            plt.savefig(f"missing_values[{column}:{null_column}].png")
        else:
            plt.show()
        plt.close()
|
||||
|
||||
def plot_missing_values(column):
    """Draw a translucent red vertical line at each value in ``column``.

    An empty line plot is added first so that a single "Missing Values"
    entry is available for the legend.

    Args:
        column: Iterable of x positions on the current axes.
    """
    line_style = {"color": "red", "alpha": 0.4}
    plt.plot([], [], label="Missing Values", **line_style)
    for x_position in column:
        plt.axvline(x=x_position, **line_style)
|
||||
|
||||
def correlation_matrix(data_frame, save=False):
    """Render an annotated heatmap of ``data_frame.corr()``.

    Args:
        data_frame: Frame whose pairwise column correlations are plotted.
        save: If true, the figure is written to ``correlation_matrix.png``;
            otherwise it is shown interactively.
    """
    correlations = data_frame.corr()
    plt.figure()
    sns.heatmap(correlations, annot=True)
    plt.title("Correlation Matrix of Existing Features")

    if not save:
        plt.show()
    else:
        plt.savefig("correlation_matrix.png")
    plt.close()
|
||||
|
||||
def imputation_plots(data_frame, imputed_data, null_column, columns, save=False):
    """Compare imputed data with the original complete rows, per column.

    For every entry of ``columns`` one figure is produced with two
    overlaid scatters of that column against ``null_column``: the imputed
    frame (green) and the fully populated rows of the original frame
    (purple).

    Args:
        data_frame: Original frame, before imputation.
        imputed_data: Frame after imputation, with the same column names.
        null_column: Name of the column that was imputed (y axis).
        columns: Names of the columns to plot on the x axis.
        save: If true, each figure is written to
            ``imputation_results[<column>:<null_column>].png``; otherwise
            it is shown interactively.
    """
    not_missing_data = data_frame.dropna()
    for column in columns:
        plt.figure()
        plt.scatter(imputed_data[column], imputed_data[null_column],
                    color=[[0, 0.502, 0, 0.4]], label="Imputed Data")
        # Fixed: the keyword was misspelt "colot" and the alpha component
        # was written "0,4", giving a five-component colour.
        plt.scatter(not_missing_data[column], not_missing_data[null_column],
                    color=[[0.502, 0, 0.502, 0.4]], label="Original Data")
        plt.title(f"Scatter Plot of {column} against {null_column} after KNN(9) Imputation")
        plt.xlabel(column)
        # Fixed: the y axis shows null_column, not column.
        plt.ylabel(null_column)
        plt.legend()
        if save:
            plt.savefig(f"imputation_results[{column}:{null_column}].png")
        else:
            plt.show()
        plt.close()
|
||||
|
||||
def outlier_box_plots(data_frame, save=False):
    """Produce one box plot figure per column of ``data_frame``.

    Args:
        data_frame: Frame whose columns are plotted individually.
        save: If true, each figure is written to
            ``outlier_box_plot[<column>].png``; otherwise it is shown
            interactively.
    """
    for column_name in data_frame.columns:
        column_values = data_frame[column_name]
        plt.figure()
        plt.title(f"Box Plot of {column_name}")
        plt.boxplot(column_values)
        plt.ylabel(column_name)
        plt.xticks(rotation=45)
        if not save:
            plt.show()
        else:
            plt.savefig(f"outlier_box_plot[{column_name}].png")
        plt.close()
|
||||
Loading…
Reference in New Issue