end of challenge

This commit is contained in:
Joshua Perry 2024-04-07 14:41:14 +01:00
parent ef79c30e8c
commit 26968d7545
9 changed files with 65531 additions and 79 deletions

4
.gitignore vendored
View File

@ -1 +1,5 @@
/target
__pycache__
build
credit_risk_imputation.egg-info
.env

242
Cargo.lock generated
View File

@ -45,6 +45,15 @@ dependencies = [
"libc",
]
[[package]]
name = "approx"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f2a05fd1bd10b2527e20a2cd32d8873d115b8b39fe219ee25f42a8aca6ba278"
dependencies = [
"num-traits",
]
[[package]]
name = "approx"
version = "0.5.1"
@ -94,7 +103,7 @@ checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80"
name = "barclays"
version = "0.1.0"
dependencies = [
"lazy_static",
"linfa-preprocessing",
"ndarray",
"polars",
"smartcore",
@ -295,6 +304,70 @@ version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"
[[package]]
name = "encoding"
version = "0.2.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec"
dependencies = [
"encoding-index-japanese",
"encoding-index-korean",
"encoding-index-simpchinese",
"encoding-index-singlebyte",
"encoding-index-tradchinese",
]
[[package]]
name = "encoding-index-japanese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-korean"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-simpchinese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-singlebyte"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding-index-tradchinese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18"
dependencies = [
"encoding_index_tests",
]
[[package]]
name = "encoding_index_tests"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
[[package]]
name = "enum_dispatch"
version = "0.3.13"
@ -356,6 +429,12 @@ version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
[[package]]
name = "hashbrown"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
[[package]]
name = "hashbrown"
version = "0.14.3"
@ -411,6 +490,16 @@ dependencies = [
"cc",
]
[[package]]
name = "indexmap"
version = "1.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
dependencies = [
"autocfg",
"hashbrown 0.12.3",
]
[[package]]
name = "indexmap"
version = "2.2.6"
@ -418,7 +507,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
dependencies = [
"equivalent",
"hashbrown",
"hashbrown 0.14.3",
]
[[package]]
name = "itertools"
version = "0.10.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
dependencies = [
"either",
]
[[package]]
@ -451,12 +549,6 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.153"
@ -469,6 +561,50 @@ version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "linfa"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1cab423110bc374e4cfa915da88952e2c6a4a5a6300ac0a0e68022bff2ace0b3"
dependencies = [
"approx 0.4.0",
"ndarray",
"num-traits",
"rand",
"sprs",
"thiserror",
]
[[package]]
name = "linfa-linalg"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56e7562b41c8876d3367897067013bb2884cc78e6893f092ecd26b305176ac82"
dependencies = [
"ndarray",
"num-traits",
"thiserror",
]
[[package]]
name = "linfa-preprocessing"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9de00d503ab60e12428b77abeff006d1c4f1dba9d962bbd72bca591706c0a8ff"
dependencies = [
"approx 0.4.0",
"encoding",
"linfa",
"linfa-linalg",
"ndarray",
"ndarray-rand",
"ndarray-stats",
"regex",
"sprs",
"thiserror",
"unicode-normalization",
]
[[package]]
name = "lock_api"
version = "0.4.11"
@ -558,6 +694,7 @@ version = "0.15.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32"
dependencies = [
"approx 0.4.0",
"matrixmultiply",
"num-complex",
"num-integer",
@ -565,6 +702,41 @@ dependencies = [
"rawpointer",
]
[[package]]
name = "ndarray-rand"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65608f937acc725f5b164dcf40f4f0bc5d67dc268ab8a649d3002606718c4588"
dependencies = [
"ndarray",
"rand",
"rand_distr",
]
[[package]]
name = "ndarray-stats"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af5a8477ac96877b5bd1fd67e0c28736c12943aba24eda92b127e036b0c8f400"
dependencies = [
"indexmap 1.9.3",
"itertools",
"ndarray",
"noisy_float",
"num-integer",
"num-traits",
"rand",
]
[[package]]
name = "noisy_float"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "978fe6e6ebc0bf53de533cd456ca2d9de13de13856eda1518a285d7705a213af"
dependencies = [
"num-traits",
]
[[package]]
name = "now"
version = "0.1.3"
@ -800,7 +972,7 @@ dependencies = [
"fast-float",
"foreign_vec",
"getrandom",
"hashbrown",
"hashbrown 0.14.3",
"itoa",
"itoap",
"lz4",
@ -856,8 +1028,8 @@ dependencies = [
"chrono-tz",
"comfy-table",
"either",
"hashbrown",
"indexmap",
"hashbrown 0.14.3",
"indexmap 2.2.6",
"ndarray",
"num-traits",
"once_cell",
@ -954,9 +1126,9 @@ dependencies = [
"chrono",
"chrono-tz",
"either",
"hashbrown",
"hashbrown 0.14.3",
"hex",
"indexmap",
"indexmap 2.2.6",
"memchr",
"num-traits",
"polars-arrow",
@ -999,7 +1171,7 @@ dependencies = [
"crossbeam-channel",
"crossbeam-queue",
"enum_dispatch",
"hashbrown",
"hashbrown 0.14.3",
"num-traits",
"polars-arrow",
"polars-compute",
@ -1097,8 +1269,8 @@ checksum = "694656a7d2b0cd8f07660dbc8d0fb7a81066ff57a452264907531d805c1e58c4"
dependencies = [
"ahash",
"bytemuck",
"hashbrown",
"indexmap",
"hashbrown 0.14.3",
"indexmap 2.2.6",
"num-traits",
"once_cell",
"polars-error",
@ -1325,7 +1497,7 @@ version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c42ca1fcd851ada8834d3dfcd088850dc8c703bde50c2baccd89181b74dc3ade"
dependencies = [
"approx",
"approx 0.5.1",
"cfg-if",
"ndarray",
"num",
@ -1344,6 +1516,18 @@ dependencies = [
"version_check",
]
[[package]]
name = "sprs"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88bab60b0a18fb9b3e0c26e92796b3c3a278bf5fa4880f5ad5cc3bdfb843d0b1"
dependencies = [
"ndarray",
"num-complex",
"num-traits",
"smallvec",
]
[[package]]
name = "sqlparser"
version = "0.39.0"
@ -1461,12 +1645,36 @@ dependencies = [
"syn 2.0.58",
]
[[package]]
name = "tinyvec"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "unicode-ident"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]]
name = "unicode-normalization"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-reverse"
version = "1.0.9"

View File

@ -6,7 +6,7 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
lazy_static = "1.4.0"
linfa-preprocessing = "0.7.0"
ndarray = "0.15.6"
polars = { version = "0.38.3", features = ["ndarray"] }
smartcore = { version = "0.3.2", features = ["ndarray-bindings"] }

32582
credit_risk_dataset.csv Normal file

File diff suppressed because it is too large Load Diff

32582
imputed_data.csv Normal file

File diff suppressed because it is too large Load Diff

40
missing_values.py Normal file
View File

@ -0,0 +1,40 @@
import pandas as pd
from sklearn.impute import KNNImputer
# Load the raw credit-risk records that will be encoded and imputed.
dataframe = pd.read_csv("credit_risk_dataset.csv")

# Integer codes for each categorical column, so that the KNN imputer
# (which works on numeric data) can consume them.
# NOTE(review): pandas maps any label that is absent from these dicts to NaN,
# which the imputer will then fill in as if the value had been missing —
# confirm that every category present in the CSV is listed here.
person_home_ownership_values = {
    label: code
    for code, label in enumerate(("RENT", "MORTGAGE", "OWN", "OTHER"), start=1)
}
loan_intent_values = {
    label: code
    for code, label in enumerate(
        ("EDUCATIONAL", "MEDICAL", "VENTURE", "PERSONAL", "DEBTCONSOLIDATION"),
        start=1,
    )
}
loan_grade_values = {
    grade: code for code, grade in enumerate(("A", "B", "C", "D", "E"), start=1)
}
cb_person_default_on_file_values = {"Y": 1, "N": 0}

# Column name -> encoding table, applied in the same order as listed.
categorical_encodings = {
    "person_home_ownership": person_home_ownership_values,
    "loan_intent": loan_intent_values,
    "loan_grade": loan_grade_values,
    "cb_person_default_on_file": cb_person_default_on_file_values,
}
for column_name, encoding in categorical_encodings.items():
    dataframe[column_name] = dataframe[column_name].map(encoding)

# Fill the remaining gaps from the 9 nearest rows, equally weighted, using a
# distance metric that tolerates NaN entries.
imputer = KNNImputer(n_neighbors=9, weights="uniform", metric="nan_euclidean")
imputed_data = imputer.fit_transform(dataframe)

# Re-attach the original column names and write the result without an index column.
imputed_frame = pd.DataFrame(imputed_data, columns=dataframe.columns)
imputed_frame.to_csv("imputed_data.csv", index=False)

15
setup.py Normal file
View File

@ -0,0 +1,15 @@
from setuptools import setup, find_packages
# Packaging metadata for the credit-risk KNN imputation tooling.
setup(
    name="credit_risk_imputation",
    version="1.0.0",
    # Kept on a single line: this value becomes the one-line "Summary"
    # metadata field, which must not contain newlines.
    description="Credit Risk Assessment Dataset Cleaning using KNN Imputation",
    author="r0r-5chach",
    author_email="r0r-5chach.xyz@proton.me",
    # NOTE(review): find_packages() only discovers package directories; if a
    # top-level script is meant to be installed it may need `py_modules` —
    # confirm against the repository layout.
    packages=find_packages(),
    install_requires=[
        "pandas",
        "scikit-learn",
    ],
)

View File

@ -1,53 +1,13 @@
pub mod preprocessing;
mod preprocessing;
use std::collections::HashMap;
use lazy_static::lazy_static;
use polars::prelude::*;
use smartcore::linalg::basic::matrix::DenseMatrix;
use linfa_preprocessing::linear_scaling::LinearScaler;
use preprocessing::SplitData;
use smartcore::ensemble::random_forest_classifier::{RandomForestClassifier, RandomForestClassifierParameters};
lazy_static! {
static ref CATEGORICAL_COLUMNS: Vec<&'static str> = vec![
"person_home_ownership",
"loan_intent",
"loan_grade",
"cb_person_default_on_file",
];
static ref HOME_OWNERSHIP_VALUES: HashMap<&'static str, u32> = HashMap::from([
("RENT", 1),
("MORTGAGE", 2),
("OWN", 3),
("OTHER", 4),
]);
static ref INTENT_VALUES: HashMap<&'static str, u32> = HashMap::from([
("EDUCATIONAL", 1),
("MEDICAL", 2),
("VENTURE", 3),
("PERSONAL", 4),
("DEBTCONSOLIDATION", 5),
]);
static ref GRADE_VALUES: HashMap<&'static str, u32> = HashMap::from([
("A", 1),
("B", 2),
("C", 3),
("D", 4),
("E", 5),
]);
static ref DEFAULT_FILE_VALUES: HashMap<&'static str, u32> = HashMap::from([
("Y", 1),
("N", 2),
]);
}
pub fn init_default_predictor() {
let data = CsvReader::from_path("credit_risk_dataset.csv")
.unwrap()
.finish()
.unwrap()
.to_ndarray::<Float64Type>(IndexOrder::Fortran)
.unwrap();
pub fn init_classifier() {
let split_data = SplitData::new();
let features = split_data.features;
let target = split_data.target;
let classifier = RandomForestClassifier::fit(&features.train, &target.train, Default::default()).unwrap();
}

View File

@ -1,19 +1,80 @@
use smartcore::{metrics::distance::euclidian::Euclidian, neighbors::knn_regressor::{KNNRegressor, KNNRegressorParameters}};
use ndarray::{Array1,Array2};
pub struct KNNImputer {
model: KNNRegressor<f64,f64,Array2<f64>,Array1<f64>,Euclidian<f64>>
use linfa_preprocessing::linear_scaling::{LinearScaler, LinearScalerParams};
use ndarray::{Array1,Array2, Axis};
use polars::prelude::*;
use smartcore::model_selection::train_test_split;
const FILE: &str = "imputed_data.csv";
const TARGET: &str = "loan_status";
/// The imported dataset, held as a feature matrix plus a separate target column.
struct Data {
/// Every column of the imported table except the `TARGET` column.
features: Array2<f64>,
/// The values of the `TARGET` column.
target: Array1<f64>,
/// Initialised to `None` by `Data::new`.
/// NOTE(review): no visible code reads or assigns this field afterwards — confirm it is still needed.
split_data: Option<SplitData>,
}
impl Data {
fn new() -> Self {
let dataframe = import_data();
let target_index = dataframe.get_column_index(TARGET).unwrap();
impl KNNImputer {
pub fn new(dataframe: ndarray::Array2<f64>, target_column: usize) -> Self {
let true_values: Array1<f64> = dataframe.column(target_column).to_vec().into();
KNNImputer {
model: KNNRegressor::fit(&dataframe, &true_values.into(), Default::default()).unwrap(),
let mut dataframe = dataframe.to_ndarray::<Float64Type>(IndexOrder::Fortran).unwrap();
let target = dataframe.index_axis(Axis(1), target_index).to_owned();
dataframe.remove_index(Axis(1), target_index);
Data {
features: dataframe,
target,
split_data: None,
}
}
pub fn impute(&self) -> bool {
//TODO: Predict value,
/// Splits the stored features and target into train/test parts and packs them
/// into a [`SplitData`].
///
/// NOTE(review): `scaler` is used below but is not bound anywhere in this
/// method or in the visible part of the file — confirm where it should come from.
/// NOTE(review): the same `scaler` is applied to the target values as well as
/// the features — confirm that scaling the target is intended.
fn split_data(&self) -> SplitData {
// NOTE(review): 0.75 is presumably smartcore's `test_size` (fraction placed in
// the test set), and `true` / `Some(79)` presumably the shuffle flag and RNG
// seed — confirm against the `train_test_split` signature.
let (features_train, features_test,
target_train, target_test) = train_test_split(&self.features, &self.target,
0.75, true, Some(79));
// Transform both feature partitions with the same `scaler`.
let features = Features{
train: scaler.transform(features_train),
test: scaler.transform(features_test)
};
// Transform both target partitions with that `scaler` too.
let target = Target{
train: scaler.transform(target_train),
test: scaler.transform(target_test)
};
SplitData {
features,
target,
}
}
}
/// Train/test partitions of the dataset, grouped by features and target.
pub struct SplitData {
/// Train and test partitions of the feature matrix.
pub features: Features,
/// Train and test partitions of the target values.
pub target:Target,
}
impl SplitData {
    /// Creates a fresh `Data` via `Data::new` and returns the result of its
    /// `split_data` method.
    pub fn new() -> Self {
        Data::new().split_data()
    }
}
/// Feature matrices for the two partitions of the dataset.
pub struct Features {
/// Feature matrix of the training partition.
pub train: Array2<f64>,
/// Feature matrix of the test partition.
pub test: Array2<f64>,
}
/// Target values for the two partitions of the dataset.
pub struct Target {
/// Target values of the training partition.
pub train: Array1<f64>,
/// Target values of the test partition.
pub test: Array1<f64>,
}
/// Reads the CSV file named by `FILE` into a polars `DataFrame`.
///
/// # Panics
///
/// Panics with a message naming the file and the underlying error if the
/// file cannot be opened or its contents cannot be parsed as CSV.
fn import_data() -> DataFrame {
    CsvReader::from_path(FILE)
        // Opening the file is I/O: report which path failed instead of a bare unwrap.
        .unwrap_or_else(|err| panic!("failed to open {FILE}: {err}"))
        .finish()
        .unwrap_or_else(|err| panic!("failed to parse {FILE} as CSV: {err}"))
}