added sample test data for processing to get correct annotations
This commit is contained in:
parent
005efb1e0e
commit
6a9d23ec8f
3 changed files with 87 additions and 0 deletions
61
test_data/processing.py
Normal file
61
test_data/processing.py
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Thu Mar 24 15:01:59 2022
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
import sys, os
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from statistics import mean, median, mode
|
||||||
|
#from statistics import multimode
|
||||||
|
from collections import Counter
|
||||||
|
import math
|
||||||
|
|
||||||
|
# https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function
|
||||||
|
#https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
|
||||||
|
#%%
|
||||||
|
# Name of the drug column in the sample data that holds the DST result.
drug = "pyrazinamide"

# sample_data.csv is committed next to this script (both live in test_data/),
# so locate it relative to the script instead of relying on one user's home
# directory layout.
try:
    data_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is not defined when the statements are pasted into an
    # interactive console; fall back to the original hard-coded location.
    data_dir = "/home/tanu/git/ML_AI_training/test_data"

data = pd.read_csv(os.path.join(data_dir, "sample_data.csv"))
data.columns
|
||||||
|
|
||||||
|
# Numeric encoding of the mutation class labels: DM -> 1, OM -> 0.
# For pnca OM is the minority class; for the other genes DM is the minority.
dm_om_map = {
    'DM': 1,
    'OM': 0,
}
mutation_labels = data.loc[:, 'mutation_info_labels']
data['dm_om_numeric'] = mutation_labels.map(dm_om_map)

# Sanity check: compare the counts of the encoded column with the counts
# of the original labels.
data.loc[:, 'dm_om_numeric'].value_counts()
mutation_labels.value_counts()
|
||||||
|
|
||||||
|
# Duplicate the drug column under the name 'dst'.
data['dst'] = data[drug]

# Sanity check: value distribution and number of missing entries,
# first for the source column ...
data[drug].value_counts()
data[drug].isna().sum()

# ... then for the copy.
data['dst'].value_counts()
data['dst'].isna().sum()

# Number of rows per mutation.
mutation_groups = data['mutationinformation']
mutation_groups.value_counts()

# Missing values per mutation in the drug column. Multi-key template:
#data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count')
data[drug].isna().groupby(mutation_groups).sum()

# GOAL is to populate na in the dst column from the count of the dm_om_numeric column
data['dst'].isna().groupby(mutation_groups).sum()
|
||||||
|
|
||||||
|
|
||||||
|
# Fill the missing values of 'dst' in two stages.
#https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
#
# NOTE: an earlier draft of this section contained the scratch statement
#     int(math.ceil(mean(foo)))   # round up
# 'foo' was never defined, so it raised NameError and stopped the script
# before either stage ran. It is kept only as a note: rounding a group mean
# up was an alternative to the mode-based fill used below.

# STAGE 1: fill NaN in 'dst' with max(multimode) of the observed 'dst' values
# that share the same mutation (previously the group mean, which produced
# fractional values).
# Series.mode() ignores NaN and returns every tied mode; .max() picks the
# largest one. A group without any observed value yields NaN, so those rows
# stay missing and are handled in stage 2.
#na_val = data.groupby(data['mutationinformation'])['dst'].mean()
group_max_mode = data.groupby('mutationinformation')['dst'].transform(
    lambda values: values.mode().max()
)
data['dst'] = data['dst'].fillna(group_max_mode)

# STAGE 2: rows that are still NaN ("true" NaN: no observed value for that
# mutation at all) take the row's own DM/OM encoding.
# The previous attempt, groupby(...).transform(['dm_om_numeric']), passed a
# column name as the transform function, which is not valid, and it would
# also have overwritten the 'dst2' computed on the line before it.
# NOTE(review): this reads "fill with DM.OM column value" as the per-row
# 'dm_om_numeric' value rather than a per-mutation aggregate - confirm.
data['dst2'] = data['dst'].fillna(data['dm_om_numeric'])
|
||||||
|
|
26
test_data/sample_data.csv
Normal file
26
test_data/sample_data.csv
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
id,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype
|
||||||
|
S1,M1A,1,0,DM,MDR
|
||||||
|
S2,M1A,1,1,DM,Pre-MDR
|
||||||
|
S3,M1A,1,1,OM,Sensitive
|
||||||
|
S4,M1A,1,NA,OM,Others
|
||||||
|
S5,M1A,1,1,OM,Pre-XDR
|
||||||
|
S6,M1A,1,1,DM,XDR
|
||||||
|
S7,M1B,1,NA,OM,MDR
|
||||||
|
S8,M1B,1,1,DM,MDR
|
||||||
|
S9,M1B,1,NA,DM,Others
|
||||||
|
S10,M1B,1,0,OM,Sensitive
|
||||||
|
S11,M1C,1,NA,OM,Pre-XDR
|
||||||
|
S12,M1C,1,NA,OM,Pre-XDR
|
||||||
|
S13,M1C,1,1,OM,MDR
|
||||||
|
S14,M1C,1,NA,DM,MDR
|
||||||
|
S15,A2B,2,0,OM,Others
|
||||||
|
S16,A2B,2,0,OM,XDR
|
||||||
|
S17,A2C,2,NA,DM,Pre-MDR
|
||||||
|
S18,A2C,2,1,DM,Pre-MDR
|
||||||
|
S19,D3E,3,1,DM,XDR
|
||||||
|
S20,D3E,3,NA,DM,MDR
|
||||||
|
S21,D3E,3,NA,OM,Pre-MDR
|
||||||
|
S22,D3P,3,0,OM,Pre-MDR
|
||||||
|
S23,D3A,3,0,OM,Sensitive
|
||||||
|
S24,P4A,4,NA,OM,Others
|
||||||
|
S25,P5A,5,1,DM,Sensitive
|
|
BIN
test_data/sample_data.ods
Normal file
BIN
test_data/sample_data.ods
Normal file
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue