55 lines
1.8 KiB
Python
Executable file
55 lines
1.8 KiB
Python
Executable file
# NOTE(review): my_df is not defined anywhere in this file as shown --
# presumably it is a pandas DataFrame left in the interactive session; confirm.
# A bare expression like the next line only renders in a REPL/notebook and is
# a no-op when the file is executed as a script.
my_df
my_df_cols = my_df.columns

# Missing-value count for every column of the input frame.
# The counts are printed explicitly so that something actually follows the
# header when the file is run as a script instead of line by line.
print("count of NULL values before imputation\n")
print(my_df.isnull().sum())

# sel_cols: columns copied into the working frames below.
# or_cols: the subset of sel_cols that the imputation steps operate on.
sel_cols = ['or_mychisq', 'log10_or_mychisq', 'or_fisher']
or_cols = ['or_mychisq', 'log10_or_mychisq']

# Missing-value count restricted to the columns that will be imputed.
print("count of NULL values before imputation\n")
print(my_df[or_cols].isnull().sum())

# df stays untouched and serves as the raw reference later on;
# my_df2 is the frame that receives the median imputation.
df = my_df[sel_cols]
my_df2 = df[sel_cols]
my_df2.isna().sum()

# For each odds-ratio column of my_df2, draw a histogram and then a density
# plot of its value_counts() result.
# NOTE(review): value_counts() is what gets plotted, so these charts show how
# often each distinct value occurs rather than the distribution of the values
# themselves -- confirm that this is intended.
for or_col in ('or_mychisq', 'log10_or_mychisq'):
    value_freqs = my_df2[or_col].value_counts()
    value_freqs.plot(kind='hist')
    value_freqs.plot(kind='density')

#my_df2['or_fisher'].value_counts().plot(kind = 'hist')
#my_df2['or_fisher'].value_counts().plot(kind = 'density')

#%%
#missing_col = ['or_mychisq']

#Technique 2: Using median to impute the missing values
#for i in missing_col:
# Fill the NaNs of every column in or_cols with that column's own median.
# DataFrame.fillna with a Series maps fill values by column label, so columns
# outside or_cols (e.g. 'or_fisher') are left as they are.
# Rebinding my_df2 to the returned frame, instead of assigning through .loc
# into a frame that was itself sliced from another frame, avoids pandas'
# SettingWithCopyWarning and leaves df unmodified.
my_df2 = my_df2.fillna(my_df2[or_cols].median())

# Print the counts explicitly so they follow the header in script mode too.
print("count of NULL values after imputation\n")
print(my_df2.isnull().sum())

# K-nearest-neighbour imputation of the or_cols columns, starting from the
# raw (not median-imputed) values held in df.
#
# pd is used below but no pandas import is visible in this file, so it is
# imported here to keep the section runnable on its own.
import pandas as pd
from sklearn.impute import KNNImputer

# The script originally imported the class only under this short alias while
# also referring to it by its full name (a NameError); both names now resolve.
KNN = KNNImputer

#my_df3 = pd.DataFrame(KNN(n_neighbors= 5, weights="uniform").fit_transform(df), columns = or_cols) #keeps the col names
# my_df3: imputed using the 5 nearest neighbours, uniformly weighted.
my_df3 = pd.DataFrame(
    KNNImputer(n_neighbors=5, weights="uniform").fit_transform(df[or_cols]),
    columns=or_cols,
)

# my_df4: same procedure with only 2 neighbours, for comparison.
imputer = KNNImputer(n_neighbors=2, weights="uniform")
my_df4 = pd.DataFrame(imputer.fit_transform(df[or_cols]), columns=or_cols)

# NOTE(review): if this check is ever re-enabled, the builtin all() iterates
# a DataFrame's column labels, not its cells; my_df3.equals(my_df4) would
# compare the contents. The two frames use different n_neighbors, so they are
# not expected to match wherever values were imputed.
#all(my_df3==my_df4)

# Histogram, then density plot, of the value_counts() of the log10 odds-ratio
# column in the KNN-imputed frame my_df3.
for plot_kind in ('hist', 'density'):
    my_df3['log10_or_mychisq'].value_counts().plot(kind=plot_kind)

# Summary statistics of the or_cols columns: raw values in df, the
# median-imputed frame my_df2, and the KNN-imputed frame my_df3.
print('\nRaw values:\n', df[or_cols].describe())
print('\nMedian imputed values:\n', my_df2[or_cols].describe())
# The label previously began with '\K', an invalid escape sequence that printed
# a literal backslash; it now starts with a newline like the labels above.
print('\nKNN imputed values:\n', my_df3[or_cols].describe())
# Bare expression: only displayed when evaluated interactively.
my_df4.describe()

# Scatter plot of or_mychisq against log10_or_mychisq, drawn first for the
# median-imputed frame (my_df2) and then for the KNN-imputed frame (my_df3).
for imputed_frame in (my_df2, my_df3):
    imputed_frame.plot.scatter(x='or_mychisq', y='log10_or_mychisq', s=100)