"""Data-preparation helpers: splitting, scaling, imputation and outlier removal."""

import pandas as pd
from scipy import stats
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


def training_test_split(data_frame, target_variable):
    """Split a frame into training/test features and targets.

    The rows are shuffled and split 75% / 25% with a fixed random seed (79),
    so repeated calls on the same frame give the same partition.

    Args:
        data_frame: Frame holding both the feature columns and the target.
        target_variable: Label of the target column in ``data_frame``.

    Returns:
        A tuple ``(training_features, training_target, test_features,
        test_target)``. The feature frames are the respective partitions
        with the target column dropped.
    """
    data_train, data_test = train_test_split(
        data_frame,
        train_size=0.75,
        random_state=79,
        shuffle=True,
    )

    training_target = data_train[target_variable]
    test_target = data_test[target_variable]
    training_features = data_train.drop(target_variable, axis=1)
    test_features = data_test.drop(target_variable, axis=1)

    return training_features, training_target, test_features, test_target


def normalise(training_features, test_features):
    """Min-max scale both feature sets using the training set's statistics.

    The scaler is fitted on ``training_features`` only and then applied to
    both inputs, so the test data does not influence the scaling parameters.

    Args:
        training_features: Features used to fit the scaler.
        test_features: Features transformed with the already-fitted scaler.

    Returns:
        A tuple ``(scaled_training, scaled_test)`` holding the outputs of
        ``MinMaxScaler.transform`` for each input.
    """
    scaler = MinMaxScaler().fit(training_features)
    return scaler.transform(training_features), scaler.transform(test_features)


def impute_missing_values(data_frame):
    """Fill missing values with a 9-nearest-neighbour imputer.

    Neighbours are weighted uniformly and distances use the
    ``nan_euclidean`` metric.

    Args:
        data_frame: Numeric frame that may contain missing values.

    Returns:
        A new DataFrame of imputed values with the same column labels and
        the same index as ``data_frame``.
    """
    imputer = KNNImputer(n_neighbors=9, weights="uniform", metric="nan_euclidean")
    imputed_data = imputer.fit_transform(data_frame)
    # Carry the original index over: rebuilding the frame without it would
    # reset the labels to 0..n-1 and break label alignment with any Series
    # (e.g. a target column) that still uses the original index.
    return pd.DataFrame(
        imputed_data,
        columns=data_frame.columns,
        index=data_frame.index,
    )


def remove_outliers(data_frame, threshold=1.3412):
    """Drop rows whose z-score falls outside ``(-threshold, threshold)``.

    Columns are processed one after another; each column's z-scores are
    computed on the rows that survived the previous columns' filtering.

    Args:
        data_frame: Numeric frame to filter. The caller's frame is not
            modified; a filtered frame is returned.
        threshold: Exclusive bound on the absolute z-score a row may have
            in every column. Defaults to 1.3412.

    Returns:
        The frame restricted to rows that passed the check in every column.

    Note:
        Missing values are not handled here. NOTE(review): with scipy's
        default NaN handling a column containing NaN is expected to produce
        NaN z-scores and therefore remove every row - impute first.
    """
    for column in data_frame.columns:
        values = data_frame[column]
        # A column with at most one distinct value has no spread, so its
        # z-scores are undefined (NaN). NaN fails both comparisons below and
        # would discard every row; such a column cannot contain outliers,
        # so skip it instead.
        if values.nunique(dropna=False) <= 1:
            continue
        z_scores = stats.zscore(values)
        within_bounds = (z_scores < threshold) & (z_scores > -threshold)
        data_frame = data_frame[within_bounds]
    return data_frame