#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 24 15:01:59 2022

@author: tanu

Exploratory script: loads a sample CSV into ``data`` and derives
per-mutation summaries (grouped on the 'mutationinformation' column).
Bare expressions are left in place; they only display a value when the
file is run interactively (presumably cell-by-cell via the '#%%' markers).
"""
import sys
import os
import pandas as pd
import numpy as np
from statistics import mean, median, mode
from statistics import multimode
from collections import Counter
from tidy_split import tidy_split
#import math

# References:
# https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function
# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
# round up
# int(math.ceil(mean(foo)))
# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
# https://stackoverflow.com/questions/37189878/pandas-add-column-to-groupby-dataframe
# https://stackoverflow.com/questions/43847520/how-to-get-the-distinct-count-of-values-in-a-python-pandas-dataframe

#%% Read data and formatting
drug = "pyrazinamide"
data = pd.read_csv("/home/tanu/git/ML_AI_training/test_data/sample_data.csv")
data.columns
data.head()

#%% Quick checks: Lineage and sample count for each mutation
data['id'].nunique()
data['mutationinformation'].nunique()

# Number of distinct sample ids in the whole table (denominator for AF later)
total_id_ucount = data['id'].nunique()
total_id_ucount

# Group once on the mutation column and inspect the columns of interest
by_mutation = data.groupby('mutationinformation')
by_mutation['lineage'].size()
by_mutation['lineage_corrupt'].size()
by_mutation['id'].size()
by_mutation['lineage'].value_counts()
by_mutation['lineage'].nunique()

#%% id count: add all ids and count of unique ids per mutation
ids_by_mutation = data.groupby('mutationinformation')['id']

# Map the per-mutation aggregate back onto every row of that mutation
data['id_list'] = data['mutationinformation'].map(ids_by_mutation.apply(list))
data['id_ucount'] = data['mutationinformation'].map(ids_by_mutation.nunique())
data[['mutationinformation', 'id', 'id_list', 'id_ucount']]

#%% Lineages: add all lineages and count of unique lineages per mutation
# Lineages good: lineage column has only a single lineage for each mutationinformation
data['lineage']
# Per-mutation list of lineages and number of distinct lineages,
# broadcast back onto every row of the mutation.
lineage_by_mutation = data.groupby('mutationinformation')['lineage']
data['lineage_list'] = data['mutationinformation'].map(lineage_by_mutation.apply(list))
data['lineage_ucount'] = data['mutationinformation'].map(lineage_by_mutation.nunique())
data[['mutationinformation', 'lineage', 'lineage_list', 'lineage_ucount']]

# Lineage corrupt: lineage column has multiple lineages for each
# mutationinformation, separated by ';'
data['lineage_corrupt']

# split using tidy_split()
data_split = tidy_split(data, 'lineage_corrupt', sep=';')

# Strip surrounding white space, else padded values are counted as distinct
#data_split['lineage_corrupt'] = data_split['lineage_corrupt'].str.lstrip()
data_split['lineage_corrupt'] = data_split['lineage_corrupt'].str.strip()
data_split.head()

# Same list / unique-count summary, now on the split (one lineage per row) table
corrupt_by_mutation = data_split.groupby('mutationinformation')['lineage_corrupt']
data_split['lineage_corrupt_list'] = data_split['mutationinformation'].map(
    corrupt_by_mutation.apply(list)
)
data_split['lineage_corrupt_ucount'] = data_split['mutationinformation'].map(
    corrupt_by_mutation.nunique()
)
data_split[['mutationinformation', 'lineage_corrupt_list', 'lineage_corrupt_ucount']]
data_split[['mutationinformation', 'lineage_ucount', 'lineage_corrupt_ucount']]

#%% AF: calculate AF for each mutation
# 1) unique ids per mutation divided by the total number of unique ids
#    (displayed only; the result is not stored)
data['id_ucount'] / total_id_ucount

#%% DM OM labels
# Keep a copy of the original mutation_info_labels column
data['mutation_info_labels_orig'] = data['mutation_info_labels']

# Convert DM/OM labels to numeric
# pnca, OM is minority, other genes: DM is minority
dm_om_map = {
    'DM': 1,
    'OM': 0,
}
data['dm_om_numeric'] = data['mutation_info_labels'].map(dm_om_map)

# sanity check
data['dm_om_numeric'].value_counts()
data['mutation_info_labels'].value_counts()

# Convert drtype column to numeric
drtype_map = {
    'XDR': 5,
    'Pre-XDR': 4,
    'MDR': 3,
    'Pre-MDR': 2,
    'Other': 1,
    'Sensitive': 0,
}
data['drtype_numeric'] = data['drtype'].map(drtype_map)

# Copy the drug column twice: 'dst' to allow cross checking,
# 'dst_multimode' as the working column that gets filled later
data['dst'] = data[drug]
data['dst_multimode'] = data[drug]

# sanity check
data[drug].value_counts()
data['dst_multimode'].value_counts()

data[drug].isna().sum()
data['dst_multimode'].isna().sum()

data['mutationinformation'].value_counts()

# Number of missing values per mutation
#data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count')
data[drug].isna().groupby(data['mutationinformation']).sum()

# GOAL is to populate na in the dst column from the count of the dm_om_numeric column
data['dst_multimode'].isna().groupby(data['mutationinformation']).sum()

# Copy mutationinformation so it can be cross-checked after the groupby steps
data['mutation'] = data['mutationinformation']

#%% POC: fill na with mean/mode/median/max for each mutation
# STAGE 1: replace mean with Max(multimode), atm it is MEAN
# NOTE: both fillna results below are displayed only, not assigned back
#na_val = data.groupby(data['mutationinformation'])['dst'].mean()
poc_groups = data.groupby('mutationinformation')
data['dst_multimode'].fillna(poc_groups['dst_multimode'].transform('mean'))
data['dst_multimode'].fillna(poc_groups['dm_om_numeric'].transform('mean'))

# STAGE 2: Fill TRUE nan with DM.OM column value, atm it is MEAN
#data['dst_mean_check'] = data['dst'].fillna(data.groupby('mutationinformation')['dst'].transform('mean'))
#data['dst_mean_check'] = data['dst'].fillna(data.groupby('mutationinformation')['dm_om_numeric'].transform('mean'))
#%% POC continued: Test getting mode
# Compare statistics.mode and statistics.multimode per mutation (display only)
#data.groupby('mutationinformation')['dm_om_numeric'].mode()
dm_om_by_mutation = data.groupby('mutationinformation')['dm_om_numeric']
dm_om_by_mutation.agg(mode)
dm_om_by_mutation.agg(multimode)

foo = dm_om_by_mutation.agg(multimode)
foo
foo = foo.to_frame()
foo['dm_om_numeric'].apply(lambda modes: max(modes))  # returns nan
foo['dm_om_numeric'].apply(lambda modes: np.nanmax(modes))

#foo.assign(dst_mode = lambda x: (x['dst']))
foo['multimode_extract'] = foo['dm_om_numeric'].apply(lambda modes: max(modes))
foo['multimode_extract']

#%% Recalculating columns [dst, drtype and mutation_info_labels]: SET Index as 'mutationinformation'
# Work on a copy indexed by mutation: per-mutation groupby results then
# align on the index when assigned as new columns.
data2 = data.copy().set_index('mutationinformation')

#%% Recalculating dst: my data
#------------------------------
# Revised dst: max(multimode)
#------------------------------
# For each mutation, generate the revised dst which is the mode of dm_om_numeric
# PROBLEM: Returns the smallest of the two when bimodal, and fails when all equally likely
# SOLUTION: Using max of the 'dst_noNA' column
#data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)

# Get multimode for dm_om_numeric column
dm_om_multimode = (
    data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
)
#dm_om_multimode

# Fill using multimode ONLY where NA in dst_multimode column
#data2['dst_multimode'] = data2['dst_multimode'].fillna(dm_om_multimode)
dst_filled = data2['dst_multimode'].fillna(dm_om_multimode)
data2['dst_multimode'] = dst_filled
# data2['dst_multimode']

# Now get the max from multimode
data2['dst_noNA'] = data2['dst_multimode'].apply(lambda val: np.nanmax(val))
print(data2)

# Finally create a revised dst: per-mutation max of dst_noNA
dst_max_by_mutation = data2.groupby('mutationinformation')['dst_noNA'].max()
data2['dst_mode'] = dst_max_by_mutation

#==============================================================================
#%% Recalculating drtype: my data
#--------------------------------
# drtype: ALL values:
# numeric and names in an array
#--------------------------------
# Seed the two columns with the raw values, then overwrite them with the
# per-mutation lists (data2 is indexed by mutationinformation at this point).
data2['drtype_all_vals'] = data2['drtype_numeric']
data2['drtype_all_names'] = data2['drtype']

# example: https://stackoverflow.com/questions/55125680/pandas-get-all-groupby-values-in-an-array
# print(df.groupby('key').data.apply(list).reset_index())
# my use case, don't need the reset_index()
drtype_vals_by_mutation = data2.groupby('mutationinformation')['drtype_all_vals']
data2['drtype_all_vals'] = drtype_vals_by_mutation.apply(list)
drtype_names_by_mutation = data2.groupby('mutationinformation')['drtype_all_names']
data2['drtype_all_names'] = drtype_names_by_mutation.apply(list)

#---------------------------------
# Revised drtype: max(Multimode)
#--------------------------------
drtype_by_mutation = data2.groupby('mutationinformation')['drtype_numeric']
data2['drtype_multimode'] = drtype_by_mutation.agg(multimode)
data2['drtype_multimode']

# Now get the max from multimode
data2['drtype_mode'] = data2['drtype_multimode'].apply(lambda modes: np.nanmax(modes))
data2.head()

#----------------------
# Revised drtype: Max
#----------------------
data2.head()
data2['drtype_max'] = data2.groupby('mutationinformation')['drtype_numeric'].max()
#data2 = data2.reset_index()
data2.head()

#%% Finally reset index
# Turn 'mutationinformation' back into a regular column
data2 = data2.reset_index()

#==============================================================================
#---------------------------------------
# Create revised mutation_info_column
#---------------------------------------
data2['dst_mode'].value_counts()
data2[drug].value_counts()

# note this is overriding, since downstream depends on it
# make a copy if you need to keep that
data2['mutation_info_labels_orig'] = data2['mutation_info_labels']

# Rebuild the DM/OM label from the revised dst value
dst_to_label = {1: 'DM', 0: 'OM'}
data2['mutation_info_labels'] = data2['dst_mode'].map(dst_to_label)
data2['mutation_info_labels_orig'].value_counts()
data2['mutation_info_labels'].value_counts()

#==============================================================================
# sanity check: the copied 'mutation' column must still match
# 'mutationinformation' row for row after the groupby/index round trip
mutation_matches = data2['mutation'] == data2['mutationinformation']
if not all(mutation_matches):
    sys.exit('\nERROR: mutationin cross checks failed. Please check your group_by() aggregate functions')
print('\nPass: Mutationinformation check successful')

# Drop mutation column (only needed for the check above)
data2.drop(columns=['mutation'], inplace=True)

#%% subset: equivalent of merged_df3?
# https://stackoverflow.com/questions/39900061/sort-lists-in-a-pandas-dataframe-column
# result = data2['dst_multimode'].sort_values().apply(lambda x: sorted(x))
# newdf = pd.DataFrame({'dst_multimode': Series(list(set(result['a'].apply(tuple))))})
# newdf.sort_values(by='a')

# data2['dst_multimode'].value_counts()
# data2.sort_values(['dst_multimode'], ascending=False)

#data_df3 = data2.drop_duplicates(['mutationinformation'])
#data_df3_v2 = data2.drop_duplicates(['mutationinformation'])
#all(data_df3 == data_df3_v2)
#%%