diff --git a/test_data/processing.py b/test_data/processing.py
new file mode 100644
index 0000000..011eae2
--- /dev/null
+++ b/test_data/processing.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar 24 15:01:59 2022
+
+@author: tanu
+
+Fill missing drug-susceptibility (dst) labels in the sample data.
+
+Stage 1 fills a missing dst with the mean dst of the same mutation;
+stage 2 fills what is still missing with the numeric DM/OM label.
+"""
+import math
+import os
+import sys
+from collections import Counter
+from pathlib import Path
+from statistics import mean, median, mode
+
+import numpy as np
+import pandas as pd
+
+# https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function
+# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
+#%%
+drug = "pyrazinamide"
+
+# sample_data.csv is stored next to this script. __file__ is undefined when
+# the code is pasted into a console, so fall back to the working directory.
+try:
+    data_dir = Path(__file__).resolve().parent
+except NameError:
+    data_dir = Path.cwd()
+
+data = pd.read_csv(data_dir / "sample_data.csv")
+print(data.columns)
+
+# Convert DM/OM labels to numeric
+dm_om_map = {'DM': 1, 'OM': 0}  # pnca, OM is minority, other genes: DM is minority
+data['dm_om_numeric'] = data['mutation_info_labels'].map(dm_om_map)
+# sanity check (printed so the output also shows when run as a script)
+print(data['dm_om_numeric'].value_counts())
+print(data['mutation_info_labels'].value_counts())
+
+# COPY dst column
+data['dst'] = data[drug]
+# sanity check
+print(data[drug].value_counts())
+print(data[drug].isnull().sum())
+
+print(data['dst'].value_counts())
+print(data['dst'].isnull().sum())
+
+print(data['mutationinformation'].value_counts())
+# number of missing values per mutation
+print(data[drug].isnull().groupby(data['mutationinformation']).sum())
+
+# GOAL is to populate na in the dst column from the count of the dm_om_numeric column
+print(data['dst'].isnull().groupby(data['mutationinformation']).sum())
+
+# STAGE 1: fill a missing dst with the mean dst of the same mutation.
+# A mutation whose dst values are all missing stays NaN at this stage.
+# TODO: replace the mean with max(statistics.multimode(...)), or round the
+# mean up with math.ceil, so that dst stays a 0/1 label.
+data['dst'] = data['dst'].fillna(
+    data.groupby('mutationinformation')['dst'].transform('mean')
+)
+
+# STAGE 2: fill the remaining ("true") NaN with the row's own DM/OM value.
+data['dst2'] = data['dst'].fillna(data['dm_om_numeric'])
diff --git a/test_data/sample_data.csv b/test_data/sample_data.csv
new file mode 100644
index 0000000..5fd5515
--- /dev/null
+++ b/test_data/sample_data.csv
@@ -0,0 +1,26 @@
+id,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype
+S1,M1A,1,0,DM,MDR
+S2,M1A,1,1,DM,Pre-MDR
+S3,M1A,1,1,OM,Sensitive
+S4,M1A,1,NA,OM,Others
+S5,M1A,1,1,OM,Pre-XDR
+S6,M1A,1,1,DM,XDR
+S7,M1B,1,NA,OM,MDR
+S8,M1B,1,1,DM,MDR
+S9,M1B,1,NA,DM,Other
+S10,M1B,1,0,OM,Sensitive
+S11,M1C,1,NA,OM,Pre-XDR
+S12,M1C,1,NA,OM,Pre-XDR
+S13,M1C,1,1,OM,MDR
+S14,M1C,1,NA,DM,MDR
+S15,A2B,2,0,OM,Others
+S16,A2B,2,0,OM,XDR
+S17,A2C,2,NA,DM,Pre-MDR
+S18,A2C,2,1,DM,Pre-MDR
+S19,D3E,3,1,DM,XDR
+S20,D3E,3,NA,DM,MDR
+S21,D3E,3,NA,OM,Pre-MDR
+S22,D3P,3,0,OM,Pre-MDR
+S23,D3A,3,0,OM,Sensitive
+S24,P4A,4,NA,OM,Others
+S25,P5A,5,1,DM,Sensitive
diff --git a/test_data/sample_data.ods b/test_data/sample_data.ods
new file mode 100644
index 0000000..aff3cca
Binary files /dev/null and b/test_data/sample_data.ods differ