# Exploratory comparison of median vs. KNN imputation for the odds-ratio
# columns of `my_df`.
#
# `my_df` and `pd` are not defined in this section; they are expected to be
# provided earlier in the file (presumably `import pandas as pd` -- confirm).
# Bare expressions such as `my_df.isnull().sum()` only show their value when
# the lines are executed interactively; they are no-ops in a plain script run.

my_df
my_df_cols = my_df.columns

print("count of NULL values before imputation\n")
my_df.isnull().sum()

sel_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher']
or_cols = ['or_mychisq', 'log10_or_mychisq']

print("count of NULL values before imputation\n")
my_df[or_cols].isnull().sum()

df = my_df[sel_cols]
# Explicit copy: my_df2 is written to by the median imputation below, and
# assigning into a frame derived by indexing raises SettingWithCopyWarning.
# `df` itself must stay un-imputed because it is the input to the KNN imputers.
my_df2 = df[sel_cols].copy()
my_df2.isna().sum()

# NOTE(review): these plot the distribution of the value *frequencies*
# (value_counts output), not of the column values themselves -- confirm that
# this is intended rather than e.g. my_df2['or_mychisq'].plot(kind='hist').
my_df2['or_mychisq'].value_counts().plot(kind='hist')
my_df2['or_mychisq'].value_counts().plot(kind='density')

my_df2['log10_or_mychisq'].value_counts().plot(kind='hist')
my_df2['log10_or_mychisq'].value_counts().plot(kind='density')

#my_df2['or_fisher'].value_counts().plot(kind = 'hist')
#my_df2['or_fisher'].value_counts().plot(kind = 'density')

#%%
#missing_col = ['or_mychisq']

# Technique 2: use each column's median to impute its missing values.
#for i in missing_col:
for col in or_cols:
    # median() skips NaN by default, so it is computed from observed values only.
    my_df2[col] = my_df2[col].fillna(my_df2[col].median())

print("count of NULL values after imputation\n")
my_df2.isnull().sum()

from sklearn.impute import KNNImputer as KNN

#my_df3 = pd.DataFrame(KNN(n_neighbors= 5, weights="uniform").fit_transform(df), columns = or_cols)

# KNN imputation with 5 neighbours; fit_transform returns a bare array, so the
# column names are restored afterwards.
# NOTE(review): the resulting frames get a fresh default index rather than
# df's index -- confirm that is acceptable for downstream use.
my_df3 = pd.DataFrame(KNN(n_neighbors=5, weights="uniform").fit_transform(df[or_cols]))
my_df3.columns = or_cols

# Same approach with 2 neighbours. The class is only imported under the alias
# `KNN`, so it must be referenced by that name (the bare name `KNNImputer`
# is not in scope here).
imputer = KNN(n_neighbors=2, weights="uniform")
my_df4 = pd.DataFrame(imputer.fit_transform(df[or_cols]), columns=or_cols)

#all(my_df3==my_df4)

my_df3['log10_or_mychisq'].value_counts().plot(kind='hist')
my_df3['log10_or_mychisq'].value_counts().plot(kind='density')

# Summary statistics: raw vs. median-imputed vs. KNN-imputed.
print('\nRaw values:\n', df[or_cols].describe())
print('\nMedian imputed values:\n', my_df2[or_cols].describe())
print('\nKNN imputed values:\n', my_df3[or_cols].describe())
my_df4.describe()

my_df2.plot.scatter(x='or_mychisq', y='log10_or_mychisq', s=100)
my_df3.plot.scatter(x='or_mychisq', y='log10_or_mychisq', s=100)