end of challenge

2024-04-07 14:41:14 +01:00 · 2024-04-07 14:41:14 +01:00 · 26968d7545
parent ef79c30e8c
commit 26968d7545
9 changed files with 65531 additions and 79 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,5 @@
 /target
+__pycache__
+build
+credit_risk_imputation.egg-info
+.env
--- a/Cargo.lock
+++ b/Cargo.lock
@ -45,6 +45,15 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "approx"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f2a05fd1bd10b2527e20a2cd32d8873d115b8b39fe219ee25f42a8aca6ba278"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "approx"
 version = "0.5.1"
@ -94,7 +103,7 @@ checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80"
 name = "barclays"
 version = "0.1.0"
 dependencies = [
- "lazy_static",
+ "linfa-preprocessing",
 "ndarray",
 "polars",
 "smartcore",
@ -295,6 +304,70 @@ version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"

+[[package]]
+name = "encoding"
+version = "0.2.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec"
+dependencies = [
+ "encoding-index-japanese",
+ "encoding-index-korean",
+ "encoding-index-simpchinese",
+ "encoding-index-singlebyte",
+ "encoding-index-tradchinese",
+]
+
+[[package]]
+name = "encoding-index-japanese"
+version = "1.20141219.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91"
+dependencies = [
+ "encoding_index_tests",
+]
+
+[[package]]
+name = "encoding-index-korean"
+version = "1.20141219.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81"
+dependencies = [
+ "encoding_index_tests",
+]
+
+[[package]]
+name = "encoding-index-simpchinese"
+version = "1.20141219.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7"
+dependencies = [
+ "encoding_index_tests",
+]
+
+[[package]]
+name = "encoding-index-singlebyte"
+version = "1.20141219.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a"
+dependencies = [
+ "encoding_index_tests",
+]
+
+[[package]]
+name = "encoding-index-tradchinese"
+version = "1.20141219.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18"
+dependencies = [
+ "encoding_index_tests",
+]
+
+[[package]]
+name = "encoding_index_tests"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
+
 [[package]]
 name = "enum_dispatch"
 version = "0.3.13"
@ -356,6 +429,12 @@ version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"

+[[package]]
+name = "hashbrown"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
+
 [[package]]
 name = "hashbrown"
 version = "0.14.3"
@ -411,6 +490,16 @@ dependencies = [
 "cc",
 ]

+[[package]]
+name = "indexmap"
+version = "1.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
+dependencies = [
+ "autocfg",
+ "hashbrown 0.12.3",
+]
+
 [[package]]
 name = "indexmap"
 version = "2.2.6"
@ -418,7 +507,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
 dependencies = [
 "equivalent",
- "hashbrown",
+ "hashbrown 0.14.3",
+]
+
+[[package]]
+name = "itertools"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
+dependencies = [
+ "either",
 ]

 [[package]]
@ -451,12 +549,6 @@ dependencies = [
 "wasm-bindgen",
 ]

-[[package]]
-name = "lazy_static"
-version = "1.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
-
 [[package]]
 name = "libc"
 version = "0.2.153"
@ -469,6 +561,50 @@ version = "0.2.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"

+[[package]]
+name = "linfa"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1cab423110bc374e4cfa915da88952e2c6a4a5a6300ac0a0e68022bff2ace0b3"
+dependencies = [
+ "approx 0.4.0",
+ "ndarray",
+ "num-traits",
+ "rand",
+ "sprs",
+ "thiserror",
+]
+
+[[package]]
+name = "linfa-linalg"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "56e7562b41c8876d3367897067013bb2884cc78e6893f092ecd26b305176ac82"
+dependencies = [
+ "ndarray",
+ "num-traits",
+ "thiserror",
+]
+
+[[package]]
+name = "linfa-preprocessing"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9de00d503ab60e12428b77abeff006d1c4f1dba9d962bbd72bca591706c0a8ff"
+dependencies = [
+ "approx 0.4.0",
+ "encoding",
+ "linfa",
+ "linfa-linalg",
+ "ndarray",
+ "ndarray-rand",
+ "ndarray-stats",
+ "regex",
+ "sprs",
+ "thiserror",
+ "unicode-normalization",
+]
+
 [[package]]
 name = "lock_api"
 version = "0.4.11"
@ -558,6 +694,7 @@ version = "0.15.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32"
 dependencies = [
+ "approx 0.4.0",
 "matrixmultiply",
 "num-complex",
 "num-integer",
@ -565,6 +702,41 @@ dependencies = [
 "rawpointer",
 ]

+[[package]]
+name = "ndarray-rand"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "65608f937acc725f5b164dcf40f4f0bc5d67dc268ab8a649d3002606718c4588"
+dependencies = [
+ "ndarray",
+ "rand",
+ "rand_distr",
+]
+
+[[package]]
+name = "ndarray-stats"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af5a8477ac96877b5bd1fd67e0c28736c12943aba24eda92b127e036b0c8f400"
+dependencies = [
+ "indexmap 1.9.3",
+ "itertools",
+ "ndarray",
+ "noisy_float",
+ "num-integer",
+ "num-traits",
+ "rand",
+]
+
+[[package]]
+name = "noisy_float"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "978fe6e6ebc0bf53de533cd456ca2d9de13de13856eda1518a285d7705a213af"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "now"
 version = "0.1.3"
@ -800,7 +972,7 @@ dependencies = [
 "fast-float",
 "foreign_vec",
 "getrandom",
- "hashbrown",
+ "hashbrown 0.14.3",
 "itoa",
 "itoap",
 "lz4",
@ -856,8 +1028,8 @@ dependencies = [
 "chrono-tz",
 "comfy-table",
 "either",
- "hashbrown",
- "indexmap",
+ "hashbrown 0.14.3",
+ "indexmap 2.2.6",
 "ndarray",
 "num-traits",
 "once_cell",
@ -954,9 +1126,9 @@ dependencies = [
 "chrono",
 "chrono-tz",
 "either",
- "hashbrown",
+ "hashbrown 0.14.3",
 "hex",
- "indexmap",
+ "indexmap 2.2.6",
 "memchr",
 "num-traits",
 "polars-arrow",
@ -999,7 +1171,7 @@ dependencies = [
 "crossbeam-channel",
 "crossbeam-queue",
 "enum_dispatch",
- "hashbrown",
+ "hashbrown 0.14.3",
 "num-traits",
 "polars-arrow",
 "polars-compute",
@ -1097,8 +1269,8 @@ checksum = "694656a7d2b0cd8f07660dbc8d0fb7a81066ff57a452264907531d805c1e58c4"
 dependencies = [
 "ahash",
 "bytemuck",
- "hashbrown",
- "indexmap",
+ "hashbrown 0.14.3",
+ "indexmap 2.2.6",
 "num-traits",
 "once_cell",
 "polars-error",
@ -1325,7 +1497,7 @@ version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c42ca1fcd851ada8834d3dfcd088850dc8c703bde50c2baccd89181b74dc3ade"
 dependencies = [
- "approx",
+ "approx 0.5.1",
 "cfg-if",
 "ndarray",
 "num",
@ -1344,6 +1516,18 @@ dependencies = [
 "version_check",
 ]

+[[package]]
+name = "sprs"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88bab60b0a18fb9b3e0c26e92796b3c3a278bf5fa4880f5ad5cc3bdfb843d0b1"
+dependencies = [
+ "ndarray",
+ "num-complex",
+ "num-traits",
+ "smallvec",
+]
+
 [[package]]
 name = "sqlparser"
 version = "0.39.0"
@ -1461,12 +1645,36 @@ dependencies = [
 "syn 2.0.58",
 ]

+[[package]]
+name = "tinyvec"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50"
+dependencies = [
+ "tinyvec_macros",
+]
+
+[[package]]
+name = "tinyvec_macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"

+[[package]]
+name = "unicode-normalization"
+version = "0.1.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
+dependencies = [
+ "tinyvec",
+]
+
 [[package]]
 name = "unicode-reverse"
 version = "1.0.9"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -6,7 +6,7 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-lazy_static = "1.4.0"
+linfa-preprocessing = "0.7.0"
 ndarray = "0.15.6"
 polars = { version = "0.38.3", features = ["ndarray"] }
 smartcore = { version = "0.3.2", features = ["ndarray-bindings"] }
--- a/credit_risk_dataset.csv
+++ b/credit_risk_dataset.csv
--- a/imputed_data.csv
+++ b/imputed_data.csv
--- a/missing_values.py
+++ b/missing_values.py
@ -0,0 +1,40 @@
+import pandas as pd
+from sklearn.impute import KNNImputer
+
+dataframe = pd.read_csv("credit_risk_dataset.csv")
+
+# Feature Conversion Values
+person_home_ownership_values = {
+    "RENT": 1,
+    "MORTGAGE": 2,
+    "OWN": 3,
+    "OTHER": 4,
+}
+loan_intent_values = {
+    "EDUCATIONAL": 1,
+    "MEDICAL": 2,
+    "VENTURE": 3,
+    "PERSONAL": 4,
+    "DEBTCONSOLIDATION": 5
+    }
+loan_grade_values = {
+    "A": 1,
+    "B": 2,
+    "C": 3,
+    "D": 4,
+    "E": 5
+}
+cb_person_default_on_file_values = {
+    "Y": 1,
+    "N": 0,
+}
+
+dataframe["person_home_ownership"] = dataframe["person_home_ownership"].map(person_home_ownership_values)
+dataframe["loan_intent"] = dataframe["loan_intent"].map(loan_intent_values)
+dataframe["loan_grade"] = dataframe["loan_grade"].map(loan_grade_values)
+dataframe["cb_person_default_on_file"] = dataframe["cb_person_default_on_file"].map(cb_person_default_on_file_values)
+
+imputer = KNNImputer(n_neighbors=9, weights="uniform", metric="nan_euclidean")
+imputed_data = imputer.fit_transform(dataframe)
+pd.DataFrame(imputed_data,
+             columns=dataframe.columns).to_csv("imputed_data.csv", index=False)
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,15 @@
+from setuptools import setup, find_packages
+
+setup(
+        name="credit_risk_imputation",
+        version="1.0.0",
+        description="""
+Credit Risk Assessment Dataset Cleaning using KNN Imputation""",
+        author="r0r-5chach",
+        author_email="r0r-5chach.xyz@proton.me",
+        packages=find_packages(),
+        install_requires=[
+            "pandas",
+            "scikit-learn"
+            ]
+        )
--- a/src/model.rs
+++ b/src/model.rs
@ -1,53 +1,13 @@
-pub mod preprocessing;
+mod preprocessing;

-use std::collections::HashMap;
-use lazy_static::lazy_static;
-use polars::prelude::*;
-use smartcore::linalg::basic::matrix::DenseMatrix;
+use linfa_preprocessing::linear_scaling::LinearScaler;
+use preprocessing::SplitData;
+use smartcore::ensemble::random_forest_classifier::{RandomForestClassifier, RandomForestClassifierParameters};

-lazy_static! {
-    static ref CATEGORICAL_COLUMNS: Vec<&'static str> = vec![
-        "person_home_ownership", 
-        "loan_intent", 
-        "loan_grade", 
-        "cb_person_default_on_file",
-    ];
-    
-    static ref HOME_OWNERSHIP_VALUES: HashMap<&'static str, u32> = HashMap::from([
-        ("RENT", 1),
-        ("MORTGAGE", 2),
-        ("OWN", 3),
-        ("OTHER", 4),
-    ]);
-
-    static ref INTENT_VALUES: HashMap<&'static str, u32> = HashMap::from([
-        ("EDUCATIONAL", 1),
-        ("MEDICAL", 2),
-        ("VENTURE", 3),
-        ("PERSONAL", 4),
-        ("DEBTCONSOLIDATION", 5),
-    ]);
-
-    static ref GRADE_VALUES: HashMap<&'static str, u32> = HashMap::from([
-        ("A", 1),
-        ("B", 2),
-        ("C", 3),
-        ("D", 4),
-        ("E", 5),
-    ]);
-
-    static ref DEFAULT_FILE_VALUES: HashMap<&'static str, u32> = HashMap::from([
-        ("Y", 1),
-        ("N", 2),
-    ]);
-}
-
-pub fn init_default_predictor() {
-    let data = CsvReader::from_path("credit_risk_dataset.csv")
-        .unwrap()
-        .finish()
-        .unwrap()
-        .to_ndarray::<Float64Type>(IndexOrder::Fortran)
-        .unwrap();
+/// Loads the train/test split and fits a random forest on the training rows
+/// using the default parameters.
+///
+/// # Panics
+///
+/// Panics if fitting the classifier returns an error.
+// NOTE(review): the fitted model is dropped when this function returns -
+// confirm whether it should be returned or stored.
+// NOTE(review): `target.train` is an `Array1<f64>`; verify that smartcore's
+// random forest accepts a floating-point label type.
+pub fn init_classifier() {
+    let SplitData { features, target } = SplitData::new();

+    let classifier = RandomForestClassifier::fit(&features.train, &target.train, Default::default()).unwrap();
 }
--- a/src/model/preprocessing.rs
+++ b/src/model/preprocessing.rs
@ -1,19 +1,80 @@
-use smartcore::{metrics::distance::euclidian::Euclidian, neighbors::knn_regressor::{KNNRegressor, KNNRegressorParameters}};
-use ndarray::{Array1,Array2};

-pub struct KNNImputer {
-    model: KNNRegressor<f64,f64,Array2<f64>,Array1<f64>,Euclidian<f64>>
+use linfa_preprocessing::linear_scaling::{LinearScaler, LinearScalerParams};
+use ndarray::{Array1,Array2, Axis};
+use polars::prelude::*;
+use smartcore::model_selection::train_test_split;
+
+const FILE: &str = "imputed_data.csv";
+const TARGET: &str = "loan_status";
+
+/// The whole dataset as a feature matrix plus its label column, before any
+/// train/test split is made.
+struct Data {
+    /// Every column of the loaded table except `TARGET`.
+    features: Array2<f64>,
+    /// The values of the `TARGET` column.
+    target: Array1<f64>,
+    // NOTE(review): only ever initialised to `None` in the code visible here -
+    // confirm whether this field is still needed.
+    split_data: Option<SplitData>,
 }
+impl Data {
+    /// Loads the table returned by `import_data` and separates the `TARGET`
+    /// column from the remaining feature columns.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the `TARGET` column is missing or the table cannot be
+    /// converted to an `f64` array.
+    fn new() -> Self  {
+        let dataframe = import_data();
+        let target_index = dataframe
+            .get_column_index(TARGET)
+            .expect("dataset is missing the target column");

-impl KNNImputer {
-    pub fn new(dataframe: ndarray::Array2<f64>, target_column: usize) -> Self {
-            let true_values: Array1<f64> = dataframe.column(target_column).to_vec().into();
-            KNNImputer {
-             model: KNNRegressor::fit(&dataframe, &true_values.into(), Default::default()).unwrap(),
+        let mut dataframe = dataframe
+            .to_ndarray::<Float64Type>(IndexOrder::Fortran)
+            .expect("failed to convert the dataset to an f64 array");
+
+        // Copy the label column out before dropping it from the feature matrix.
+        let target = dataframe.index_axis(Axis(1), target_index).to_owned();
+        dataframe.remove_index(Axis(1), target_index);
+
+        Data {
+            features: dataframe,
+            target,
+            split_data: None,
         }
     }

-    pub fn impute(&self) -> bool {
-        //TODO: Predict value, 
+    /// Shuffles the rows with a fixed seed, splits them into train and test
+    /// partitions, and standardises the feature columns.
+    ///
+    /// Column means and standard deviations are computed from the training
+    /// partition only and then applied to both partitions. Labels are returned
+    /// unscaled.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the training partition has no rows.
+    fn split_data(&self) -> SplitData {
+        // NOTE(review): the third argument is smartcore's `test_size`, so 0.75
+        // would hold out 75% of the rows for testing - confirm that is intended.
+        let (features_train, features_test, target_train, target_test) =
+            train_test_split(&self.features, &self.target, 0.75, true, Some(79));
+
+        // Fit the scaling on the training rows only so the test rows cannot
+        // influence it.
+        let mean = features_train
+            .mean_axis(Axis(0))
+            .expect("training split must contain at least one row");
+        // A constant column has zero spread; divide by 1 instead to avoid NaN.
+        let spread = features_train
+            .std_axis(Axis(0), 0.0)
+            .mapv(|sd| if sd > 0.0 { sd } else { 1.0 });
+        let standardise = |block: Array2<f64>| (block - &mean) / &spread;
+
+        SplitData {
+            features: Features {
+                train: standardise(features_train),
+                test: standardise(features_test),
+            },
+            // Class labels are identifiers, not magnitudes, so they are not scaled.
+            target: Target {
+                train: target_train,
+                test: target_test,
+            },
         }
     }
+}
+
+/// The dataset after splitting: feature matrices and label vectors, each held
+/// as a train/test pair.
+pub struct SplitData {
+    /// Train and test feature matrices.
+    pub features: Features,
+    /// Train and test label vectors.
+    pub target: Target,
+}
+impl SplitData {
+    /// Builds the full dataset with `Data::new` and returns its train/test split.
+    pub fn new() -> Self {
+        Data::new().split_data()
+    }
+}
+
+/// Feature matrices for the two partitions of the dataset.
+pub struct Features {
+    /// Feature matrix of the training partition.
+    pub train: Array2<f64>,
+    /// Feature matrix of the test partition.
+    pub test: Array2<f64>,
+}
+
+/// Label vectors for the two partitions of the dataset.
+pub struct Target {
+    /// Labels of the training partition.
+    pub train: Array1<f64>,
+    /// Labels of the test partition.
+    pub test: Array1<f64>,
+}
+
+/// Reads the CSV file named by `FILE` into a `DataFrame`.
+///
+/// # Panics
+///
+/// Panics if the file cannot be opened or its contents cannot be parsed.
+fn import_data() -> DataFrame {
+    let reader = CsvReader::from_path(FILE).unwrap();
+    reader.finish().unwrap()
+}