# ML_AI_training/UQ_or_impute.py

# Exploratory look at the odds-ratio columns before any imputation.
# `my_df` is not created in this script; it must already exist in the
# namespace when this section runs.
my_df_cols = my_df.columns

# Missing-value counts for every column of the full frame. The result is
# printed explicitly: a bare expression is silently discarded when the file
# is run as a script or as a whole cell.
print("count of NULL values before imputation\n")
print(my_df.isnull().sum())

# sel_cols: the odds-ratio columns carried through this script.
# or_cols:  the subset of those columns that is imputed further down.
sel_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher']
or_cols = ['or_mychisq', 'log10_or_mychisq']

print("count of NULL values before imputation\n")
print(my_df[or_cols].isnull().sum())

# `df` is kept untouched as the raw reference. `my_df2` is an explicit copy
# so that in-place imputation on it can never leak back into `df`.
df = my_df[sel_cols]
my_df2 = df[sel_cols].copy()
print(my_df2.isna().sum())

# Distribution of the raw values. The values themselves are plotted:
# calling value_counts() first would plot the distribution of the
# per-value frequencies instead of the odds ratios.
for col in or_cols:
    my_df2[col].plot(kind='hist')
    my_df2[col].plot(kind='density')
#%%
#missing_col = ['or_mychisq']

# Technique 2: median imputation.
# Every missing entry in each odds-ratio column is replaced by the median
# of the observed values of that same column. DataFrame.fillna with a
# Series argument matches the fill values to columns by label, so each
# column receives its own median.
my_df2[or_cols] = my_df2[or_cols].fillna(my_df2[or_cols].median())

# Print the counts explicitly; a bare expression would be discarded when
# the file is run as a script or as a whole cell.
print("count of NULL values after imputation\n")
print(my_df2.isnull().sum())

# Technique 3: k-nearest-neighbour imputation of the odds-ratio columns,
# followed by a side-by-side comparison with the raw and median-imputed data.
import pandas as pd
from sklearn.impute import KNNImputer

# The class was originally imported only under the alias `KNN`, which made
# the later `KNNImputer(...)` call fail with NameError. Both names are
# available now.
KNN = KNNImputer

# NOTE(review): log10_or_mychisq looks like a transform of or_mychisq, so the
# two columns are presumably missing on the same rows. For such rows the
# imputer has no observed feature to compute neighbour distances from --
# confirm whether or_fisher (i.e. df[sel_cols]) should be used as input.

# k = 5 neighbours. The original row index is kept so the imputed frame
# stays aligned with `df` and `my_df2`.
my_df3 = pd.DataFrame(
    KNNImputer(n_neighbors=5, weights="uniform").fit_transform(df[or_cols]),
    columns=or_cols,
    index=df.index,
)

# k = 2 neighbours, for comparison with the k = 5 result.
# Use my_df3.equals(my_df4) to test whether the two results are identical.
imputer = KNNImputer(n_neighbors=2, weights="uniform")
my_df4 = pd.DataFrame(
    imputer.fit_transform(df[or_cols]),
    columns=or_cols,
    index=df.index,
)

# Distribution of the KNN-imputed values. The values themselves are plotted
# rather than their value_counts(), which would show the distribution of
# frequencies instead.
my_df3['log10_or_mychisq'].plot(kind='hist')
my_df3['log10_or_mychisq'].plot(kind='density')

# Summary statistics: raw vs. median-imputed vs. KNN-imputed.
print('\nRaw values:\n', df[or_cols].describe())
print('\nMedian imputed values:\n', my_df2[or_cols].describe())
print('\nKNN imputed values:\n', my_df3[or_cols].describe())
print('\nKNN (n_neighbors=2) imputed values:\n', my_df4.describe())

# or_mychisq against its log10 column after each imputation technique.
my_df2.plot.scatter(x='or_mychisq', y='log10_or_mychisq', s=100)
my_df3.plot.scatter(x='or_mychisq', y='log10_or_mychisq', s=100)