37 lines
1.4 KiB
Python
37 lines
1.4 KiB
Python
import pandas as pd
|
|
from scipy import stats
|
|
from sklearn.impute import KNNImputer
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
|
|
|
|
def training_test_split(data_frame, target_variable):
    """Split a frame into training/test features and targets.

    The rows are shuffled and divided 75% / 25% with a fixed seed
    (``random_state=79``), so repeated calls on the same input yield
    the same partition.

    Args:
        data_frame: Frame holding both the feature columns and the target.
        target_variable: Label of the column to separate out as the target.

    Returns:
        A tuple ``(training_features, training_target, test_features,
        test_target)``.
    """
    train_part, test_part = train_test_split(
        data_frame, train_size=0.75, random_state=79, shuffle=True
    )

    # Features are everything except the target column.
    train_x = train_part.drop(columns=target_variable)
    test_x = test_part.drop(columns=target_variable)

    train_y = train_part[target_variable]
    test_y = test_part[target_variable]

    return train_x, train_y, test_x, test_y
|
|
|
|
|
|
def normalise(training_features, test_features):
    """Min-max scale both feature sets using training-set statistics.

    The scaler is fitted on ``training_features`` only and then applied
    to both inputs, so nothing from the test set influences the scaling.

    Args:
        training_features: Features used to fit the scaler.
        test_features: Features transformed with the already-fitted scaler.

    Returns:
        A tuple ``(scaled_training_features, scaled_test_features)`` as
        produced by ``MinMaxScaler.transform``.
    """
    min_max = MinMaxScaler()
    min_max.fit(training_features)

    scaled_training = min_max.transform(training_features)
    scaled_test = min_max.transform(test_features)
    return scaled_training, scaled_test
|
|
|
|
|
|
def impute_missing_values(data_frame):
    """Fill missing values with a 9-nearest-neighbour imputer.

    Neighbours are weighted uniformly and distances use the
    ``nan_euclidean`` metric.

    Args:
        data_frame: Frame whose missing entries should be imputed.

    Returns:
        A new ``pandas.DataFrame`` with the same column labels and the
        same row index as ``data_frame``.
    """
    imputer = KNNImputer(n_neighbors=9, weights="uniform",
                         metric="nan_euclidean")
    imputed_data = imputer.fit_transform(data_frame)
    # Keep the caller's row labels: rebuilding the frame without an index
    # would reset it to 0..n-1, which breaks label alignment with any
    # separately held target once the rows have been shuffled or filtered.
    return pd.DataFrame(imputed_data, columns=data_frame.columns,
                        index=data_frame.index)
|
|
|
|
|
|
def remove_outliers(data_frame, threshold=1.3412):
    """Drop rows whose z-score falls outside ``(-threshold, threshold)``.

    Columns are processed in order, and each column's z-scores are
    computed on the rows that survived the previous columns. The input
    frame itself is not modified; a filtered frame is returned.

    Args:
        data_frame: Frame to filter.
        threshold: Absolute z-score at or beyond which a row is treated
            as an outlier. Defaults to 1.3412.

    Returns:
        The frame restricted to rows that are not outliers in any column.
    """
    for column in data_frame.columns:
        if data_frame.empty:
            # Nothing left to filter; z-scores of an empty column are
            # meaningless.
            break
        z_scores = stats.zscore(data_frame[column])
        # Mark only rows that are positively outside the band. A NaN
        # z-score (e.g. from a zero-variance column) compares False here,
        # so such rows are kept rather than wiping out the whole frame.
        is_outlier = (z_scores >= threshold) | (z_scores <= -threshold)
        data_frame = data_frame[~is_outlier]
    return data_frame
|