# barclays_challenge_event_20.../model/preprocessing.py
#
# 37 lines
# 1.4 KiB
# Python
# Raw Normal View History
#
# 2024-03-29 16:05:46 +00:00
import pandas as pd
from scipy import stats
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
def training_test_split(data_frame, target_variable, train_size=0.75,
                        random_state=79):
    """Split a data frame into training/test features and targets.

    The rows are shuffled and partitioned with scikit-learn's
    ``train_test_split``; the target column is then separated from the
    remaining (feature) columns in each partition.

    Args:
        data_frame: Frame holding the feature columns and the target column.
        target_variable: Label of the target column in ``data_frame``.
        train_size: Share of the rows assigned to the training partition;
            forwarded to ``train_test_split``. Defaults to 0.75.
        random_state: Seed forwarded to ``train_test_split`` so the shuffle
            is reproducible. Defaults to 79.

    Returns:
        The tuple ``(training_features, training_target, test_features,
        test_target)``.
    """
    data_train, data_test = train_test_split(
        data_frame,
        train_size=train_size,
        random_state=random_state,
        shuffle=True,
    )

    training_target = data_train[target_variable]
    test_target = data_test[target_variable]

    # ``drop`` returns new frames, so the two partitions are left untouched.
    training_features = data_train.drop(columns=target_variable)
    test_features = data_test.drop(columns=target_variable)

    return training_features, training_target, test_features, test_target
def normalise(training_features, test_features):
    """Min-max scale both feature sets with ranges learned from training data.

    The scaler is fitted on ``training_features`` only and then applied to
    both inputs, so the test set is scaled with the training set's
    per-feature minimum and maximum.

    Args:
        training_features: Features used to fit the scaler.
        test_features: Features scaled with the already fitted scaler.

    Returns:
        The tuple ``(scaled_training_features, scaled_test_features)`` as
        produced by ``MinMaxScaler.transform``.
    """
    min_max_scaler = MinMaxScaler()
    min_max_scaler.fit(training_features)

    scaled_training = min_max_scaler.transform(training_features)
    scaled_test = min_max_scaler.transform(test_features)
    return scaled_training, scaled_test
def impute_missing_values(data_frame, n_neighbors=9):
    """Fill missing values using k-nearest-neighbours imputation.

    A ``KNNImputer`` with uniform weights and the ``nan_euclidean`` metric
    is fitted on ``data_frame`` and applied to it; the result is wrapped in
    a new ``DataFrame`` carrying the input's column and row labels.

    Args:
        data_frame: Frame whose missing entries should be imputed.
        n_neighbors: Number of neighbours the imputer uses. Defaults to 9.

    Returns:
        A new ``pandas.DataFrame`` of imputed values with the same columns
        and index as ``data_frame``.
    """
    # NOTE(review): by default KNNImputer discards columns that are entirely
    # missing, in which case the column labels below would no longer match
    # the imputed array -- confirm inputs never contain such columns.
    imputer = KNNImputer(n_neighbors=n_neighbors, weights="uniform",
                         metric="nan_euclidean")
    imputed_data = imputer.fit_transform(data_frame)

    # Re-attach the caller's row labels: without ``index=`` the result would
    # be renumbered 0..n-1 and could no longer be aligned with data (e.g. a
    # target column) that still uses the original index.
    return pd.DataFrame(imputed_data, columns=data_frame.columns,
                        index=data_frame.index)
def remove_outliers(data_frame, threshold=1.3412):
    """Drop rows whose z-score lies outside ``+/-threshold`` in any column.

    Columns are processed one after another, and each column's z-scores are
    computed on the rows that survived the previous columns, so the result
    can depend on the column order. The input frame is not modified; a
    filtered frame (keeping the original row labels) is returned.

    Args:
        data_frame: Frame of numeric columns to filter.
        threshold: Rows are kept only while the absolute z-score is strictly
            below this value. Defaults to 1.3412.

    Returns:
        The rows of ``data_frame`` that passed the z-score filter for every
        column.
    """
    for column in data_frame.columns:
        column_values = data_frame[column]

        # A column with at most one distinct value has zero spread, so its
        # z-scores are undefined (NaN). Comparing NaN against the threshold
        # is always False and would silently discard every row, although
        # such a column cannot contain an outlier -- skip it instead.
        if column_values.nunique(dropna=False) <= 1:
            continue

        z_scores = stats.zscore(column_values)
        data_frame = data_frame[abs(z_scores) < threshold]

    return data_frame