diff --git a/test_data/processing_custom.py b/test_data/processing_custom.py
new file mode 100644
index 0000000..fd0858b
--- /dev/null
+++ b/test_data/processing_custom.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar 24 15:01:59 2022
+
+@author: tanu
+"""
+import sys, os
+import pandas as pd
+import numpy as np
+from statistics import mean, median, mode
+from statistics import multimode
+from collections import Counter
+from tidy_split import tidy_split
+
+#import math
+
+# https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function
+# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
+# round up
+# int(math.ceil(mean(foo)))
+# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
+# https://stackoverflow.com/questions/37189878/pandas-add-column-to-groupby-dataframe
+# https://stackoverflow.com/questions/43847520/how-to-get-the-distinct-count-of-values-in-a-python-pandas-dataframe
+#%% Read gene_LF2 and formatting
+drug = "pyrazinamide"
+
+gene_LF2 = pd.read_csv("/home/tanu/git/ML_AI_training/test_gene_LF2/sample_gene_LF2.csv")  # NOTE(review): path may be a casualty of the data->gene_LF2 rename (processing_v2.py reads test_data/sample_data.csv) - confirm it exists
+gene_LF2.columns
+
+gene_LF2.head()
+#%% Quick checks: Lineage and sample count for each mutation
+gene_LF2['id'].nunique()
+gene_LF2['mutationinformation'].nunique()
+total_id_ucount = gene_LF2['id'].nunique()
+total_id_ucount
+
+gene_LF2.groupby('mutationinformation')['lineage'].size()
+gene_LF2.groupby('mutationinformation')['lineage_corrupt'].size()
+gene_LF2.groupby('mutationinformation')['id'].size()
+gene_LF2.groupby('mutationinformation')['lineage'].value_counts()
+gene_LF2.groupby('mutationinformation')['lineage'].nunique()
+#%% id count: add the list of ids and the count of unique ids per mutation
+gene_LF2['id_list'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].apply(list))
+gene_LF2['id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
+gene_LF2[['mutationinformation', 'id', 'id_list', 'id_ucount']]
+#%% Lineages: add all lineages and count of unique lineages per mutation
+# Lineages good: lineage column has only a single lineage for each mutationinformation
+gene_LF2['lineage']
+gene_LF2['lineage_list'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['lineage'].apply(list))
+gene_LF2['lineage_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['lineage'].nunique())
+gene_LF2[['mutationinformation', 'lineage', 'lineage_list', 'lineage_ucount']]
+
+# Lineage corrupt: lineage column has multiple lineages for each mutationinformation separated by ';'
+gene_LF2['lineage_corrupt']
+# split using tidy_split()
+gene_LF2_split = tidy_split(gene_LF2, 'lineage_corrupt', sep = ';')
+# remove leading and trailing white space, else padded values are counted as distinct lineages as well
+#gene_LF2_split['lineage_corrupt'] = gene_LF2_split['lineage_corrupt'].str.lstrip()
+gene_LF2_split['lineage_corrupt'] = gene_LF2_split['lineage_corrupt'].str.strip()
+gene_LF2_split.head()
+
+gene_LF2_split['lineage_corrupt_list'] = gene_LF2_split['mutationinformation'].map(gene_LF2_split.groupby('mutationinformation')['lineage_corrupt'].apply(list))
+gene_LF2_split['lineage_corrupt_ucount'] = gene_LF2_split['mutationinformation'].map(gene_LF2_split.groupby('mutationinformation')['lineage_corrupt'].nunique())
+
+gene_LF2_split[['mutationinformation', 'lineage_corrupt_list', 'lineage_corrupt_ucount']]
+gene_LF2_split[['mutationinformation', 'lineage_ucount', 'lineage_corrupt_ucount']]
+
+#%% AF: calculate AF for each mutation
+#1) calculate no. of unique ids
+gene_LF2['id_ucount']/total_id_ucount
+
+#%% DM OM labels
+# COPY mutation_info_labels column
+gene_LF2['mutation_info_labels_orig'] = gene_LF2['mutation_info_labels']
+
+# Convert DM/OM labels to numeric
+dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority
+gene_LF2['dm_om_numeric'] = gene_LF2['mutation_info_labels'].map(dm_om_map)
+# sanity check
+gene_LF2['dm_om_numeric'].value_counts()
+gene_LF2['mutation_info_labels'].value_counts()
+
+# Convert drtype column to numeric
+drtype_map = {'XDR': 5
+              , 'Pre-XDR': 4
+              , 'MDR': 3
+              , 'Pre-MDR': 2
+              , 'Other': 1
+              , 'Sensitive': 0}
+
+gene_LF2['drtype_numeric'] = gene_LF2['drtype'].map(drtype_map)
+
+# COPY dst column
+gene_LF2['dst'] = gene_LF2[drug] # to allow cross checking
+gene_LF2['dst_multimode'] = gene_LF2[drug]
+
+# sanity check
+gene_LF2[drug].value_counts()
+gene_LF2['dst_multimode'].value_counts()
+
+gene_LF2[drug].isnull().sum()
+gene_LF2['dst_multimode'].isnull().sum()
+
+gene_LF2['mutationinformation'].value_counts()
+#gene_LF2.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count')
+gene_LF2[drug].isnull().groupby(gene_LF2['mutationinformation']).sum()
+
+# GOAL is to populate NA in the dst column from the per-mutation multimode of the dm_om_numeric column
+gene_LF2['dst_multimode'].isnull().groupby(gene_LF2['mutationinformation']).sum()
+
+gene_LF2['mutation'] = gene_LF2['mutationinformation']  # COPY for the final sanity check, which compares and then drops 'mutation'
+
+#%% Recalculating dst: my gene_LF2
+#------------------------------
+# Revised dst: max(multimode)
+#------------------------------
+# For each mutation, generate the revised dst which is the mode of dm_om_numeric
+# PROBLEM: Returns the smallest of the two when bimodal, and fails when all equally likely
+# SOLUTION: Using max of the 'dst_noNA' column
+#gene_LF2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
+
+# Get multimode for dm_om_numeric column (one list of modes per mutation, indexed by mutationinformation)
+dm_om_multimode = gene_LF2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
+#dm_om_multimode
+
+gene_LF2 = gene_LF2.set_index('mutationinformation')  # index by mutation (as processing_v2.py does) so the per-mutation groupby results below align on assignment
+# Fill using multimode ONLY where NA in dst_multimode column
+gene_LF2['dst_multimode'] = gene_LF2['dst_multimode'].fillna(dm_om_multimode)
+
+# gene_LF2['dst_multimode']
+
+# Now get the max from multimode (entries are either an original dst value or a filled-in list of modes)
+gene_LF2['dst_noNA'] = gene_LF2['dst_multimode'].apply(lambda x: np.nanmax(x))
+print(gene_LF2)
+
+# Finally create a revised dst with the per-mutation max of dst_noNA
+gene_LF2['dst_mode'] = gene_LF2.groupby('mutationinformation')['dst_noNA'].max()
+#==============================================================================
+#%% Recalculating drtype: my gene_LF2
+#--------------------------------
+# drtype: ALL values:
+# numeric and names in an array
+#--------------------------------
+gene_LF2['drtype_all_vals'] = gene_LF2['drtype_numeric']
+gene_LF2['drtype_all_names'] = gene_LF2['drtype']
+
+# example: https://stackoverflow.com/questions/55125680/pandas-get-all-groupby-values-in-an-array
+# print(df.groupby('key').data.apply(list).reset_index()) # my use case, don't need the reset_index()
+gene_LF2['drtype_all_vals'] = gene_LF2.groupby('mutationinformation').drtype_all_vals.apply(list)
+gene_LF2['drtype_all_names'] = gene_LF2.groupby('mutationinformation').drtype_all_names.apply(list)
+
+#---------------------------------
+# Revised drtype: max(Multimode)
+#--------------------------------
+gene_LF2['drtype_multimode'] = gene_LF2.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
+gene_LF2['drtype_multimode']
+
+# Now get the max from multimode
+gene_LF2['drtype_mode'] = gene_LF2['drtype_multimode'].apply(lambda x: np.nanmax(x))
+gene_LF2.head()
+
+#----------------------
+# Revised drtype: Max
+#----------------------
+gene_LF2.head()
+gene_LF2['drtype_max'] = gene_LF2.groupby(['mutationinformation'])['drtype_numeric'].max()
+#gene_LF2 = gene_LF2.reset_index()
+gene_LF2.head()
+
+#%% Finally reset index
+gene_LF2 = gene_LF2.reset_index()  # turns 'mutationinformation' back into a regular column for the checks that follow
+#==============================================================================
+#---------------------------------------
+# Create revised mutation_info_column
+#---------------------------------------
+gene_LF2['dst_mode'].value_counts()
+gene_LF2[drug].value_counts()
+
+# note this is overriding mutation_info_labels, since downstream depends on it
+# make a copy first if you need to keep the original labels
+gene_LF2['mutation_info_labels_orig'] = gene_LF2['mutation_info_labels']
+gene_LF2['mutation_info_labels'] = gene_LF2['dst_mode'].map({1: 'DM'
+                                                              , 0: 'OM'})
+gene_LF2['mutation_info_labels_orig'].value_counts()
+gene_LF2['mutation_info_labels'].value_counts()
+#==============================================================================
+# sanity check: every row's 'mutation' value must equal its 'mutationinformation' value, else exit
+if (all(gene_LF2['mutation'] == gene_LF2['mutationinformation'])):
+    print('\nPass: Mutationinformation check successful')
+else:
+    sys.exit('\nERROR: mutationin cross checks failed. Please check your group_by() aggregate functions')
+
+# Drop mutation column (only needed for the check above)
+gene_LF2.drop(['mutation'], axis=1, inplace=True)
+#%% subset: equivalent of merged_df3?
+# https://stackoverflow.com/questions/39900061/sort-lists-in-a-pandas-dataframe-column
+
+# result = gene_LF2['dst_multimode'].sort_values().apply(lambda x: sorted(x))
+# newdf = pd.DataFrame({'dst_multimode': Series(list(set(result['a'].apply(tuple))))})
+# newdf.sort_values(by='a')
+
+# gene_LF2['dst_multimode'].value_counts()
+# gene_LF2.sort_values(['dst_multimode'], ascending=False)
+
+#gene_LF2_df3 = gene_LF2.drop_duplicates(['mutationinformation'])
+#gene_LF2_df3_v2 = gene_LF2.drop_duplicates(['mutationinformation'])
+#all(gene_LF2_df3 == gene_LF2_df3_v2)
+#%%
diff --git a/test_data/processing_v2.py b/test_data/processing_v2.py
new file mode 100644
index 0000000..ad7180e
--- /dev/null
+++ b/test_data/processing_v2.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar 24 15:01:59 2022
+
+@author: tanu
+"""
+import sys, os
+import pandas as pd
+import numpy as np
+from statistics import mean, median, mode
+from statistics import multimode
+from collections import Counter
+from tidy_split import tidy_split
+
+#import math
+
+# https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function
+# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
+# round up
+# int(math.ceil(mean(foo)))
+# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
+# https://stackoverflow.com/questions/37189878/pandas-add-column-to-groupby-dataframe
+# https://stackoverflow.com/questions/43847520/how-to-get-the-distinct-count-of-values-in-a-python-pandas-dataframe
+#%% Read data and formatting
+drug = "pyrazinamide"
+
+data = pd.read_csv("/home/tanu/git/ML_AI_training/test_data/sample_data.csv")
+data.columns
+
+data.head()
+#%% Quick checks: Lineage and sample count for each mutation
+data['id'].nunique()
+data['mutationinformation'].nunique()
+total_id_ucount = data['id'].nunique()
+total_id_ucount
+
+data.groupby('mutationinformation')['lineage'].size()
+data.groupby('mutationinformation')['lineage_corrupt'].size()
+data.groupby('mutationinformation')['id'].size()
+data.groupby('mutationinformation')['lineage'].value_counts()
+data.groupby('mutationinformation')['lineage'].nunique()
+#%% id count: add the list of ids and the count of unique ids per mutation
+data['id_list'] = data['mutationinformation'].map(data.groupby('mutationinformation')['id'].apply(list))
+data['id_ucount'] = data['mutationinformation'].map(data.groupby('mutationinformation')['id'].nunique())
+data[['mutationinformation', 'id', 'id_list', 'id_ucount']]
+#%% Lineages: add all lineages and count of unique lineages per mutation
+# Lineages good: lineage column has only a single lineage for each mutationinformation
+data['lineage']
+data['lineage_list'] = data['mutationinformation'].map(data.groupby('mutationinformation')['lineage'].apply(list))
+data['lineage_ucount'] = data['mutationinformation'].map(data.groupby('mutationinformation')['lineage'].nunique())
+data[['mutationinformation', 'lineage', 'lineage_list', 'lineage_ucount']]
+
+# Lineage corrupt: lineage column has multiple lineages for each mutationinformation separated by ';'
+data['lineage_corrupt']
+# split using tidy_split()
+data_split = tidy_split(data, 'lineage_corrupt', sep = ';')
+# remove leading and trailing white space, else padded values are counted as distinct lineages as well
+#data_split['lineage_corrupt'] = data_split['lineage_corrupt'].str.lstrip()
+data_split['lineage_corrupt'] = data_split['lineage_corrupt'].str.strip()
+data_split.head()
+
+data_split['lineage_corrupt_list'] = data_split['mutationinformation'].map(data_split.groupby('mutationinformation')['lineage_corrupt'].apply(list))
+data_split['lineage_corrupt_ucount'] = data_split['mutationinformation'].map(data_split.groupby('mutationinformation')['lineage_corrupt'].nunique())
+
+data_split[['mutationinformation', 'lineage_corrupt_list', 'lineage_corrupt_ucount']]
+data_split[['mutationinformation', 'lineage_ucount', 'lineage_corrupt_ucount']]
+
+#%% AF: calculate AF for each mutation
+#1) calculate no. of unique ids
+data['id_ucount']/total_id_ucount
+
+#%% DM OM labels
+# COPY mutation_info_labels column
+data['mutation_info_labels_orig'] = data['mutation_info_labels']
+
+# Convert DM/OM labels to numeric
+dm_om_map = {'DM': 1, 'OM': 0} # pnca, OM is minority, other genes: DM is minority
+data['dm_om_numeric'] = data['mutation_info_labels'].map(dm_om_map)
+# sanity check
+data['dm_om_numeric'].value_counts()
+data['mutation_info_labels'].value_counts()
+
+# Convert drtype column to numeric
+drtype_map = {'XDR': 5
+              , 'Pre-XDR': 4
+              , 'MDR': 3
+              , 'Pre-MDR': 2
+              , 'Other': 1
+              , 'Sensitive': 0}
+
+data['drtype_numeric'] = data['drtype'].map(drtype_map)
+
+# COPY dst column
+data['dst'] = data[drug] # to allow cross checking
+data['dst_multimode'] = data[drug]
+
+# sanity check
+data[drug].value_counts()
+data['dst_multimode'].value_counts()
+
+data[drug].isnull().sum()
+data['dst_multimode'].isnull().sum()
+
+data['dst_multimode']
+
+data['mutationinformation'].value_counts()
+data[drug].isnull().groupby(data['mutationinformation']).sum()
+
+# GOAL is to populate NA in the dst column from the per-mutation multimode of the dm_om_numeric column
+data['dst_multimode'].isnull().groupby(data['mutationinformation']).sum()
+
+# COPY mutationinformation for the final sanity check (compared against, then dropped)
+data['mutation'] = data['mutationinformation']
+
+#%% POC continued: Test getting mode
+#data.groupby('mutationinformation')['dm_om_numeric'].mode()
+data.groupby('mutationinformation')['dm_om_numeric'].agg(mode)
+data.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
+foo = data.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
+foo
+foo = foo.to_frame()
+foo['dm_om_numeric'].apply(lambda x: max(x))# returns nan
+foo['dm_om_numeric'].apply(lambda x: np.nanmax(x))
+#foo.assign(dst_mode = lambda x: (x['dst']))
+foo['multimode_extract'] = foo['dm_om_numeric'].apply(lambda x: max(x))
+foo['multimode_extract']
+#%% Recalculating columns [dst, drtype and mutation_info_labels]: SET Index as 'mutationinformation'
+data2 = data.copy()
+# Set 'mutationinformation' as the index so the per-mutation groupby results below can be assigned directly
+data2 = data2.set_index(['mutationinformation'])
+#%% Recalculating dst: my data
+#------------------------------
+# Revised dst: max(multimode)
+#------------------------------
+# For each mutation, generate the revised dst which is the mode of dm_om_numeric
+# PROBLEM: Returns the smallest of the two when bimodal, and fails when all equally likely
+# SOLUTION: Using max of the 'dst_noNA' column
+#data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
+
+# Get multimode for dm_om_numeric column (one list of modes per mutation)
+dm_om_multimode = data2.groupby('mutationinformation')['dm_om_numeric'].agg(multimode)
+dm_om_multimode
+
+# Fill using multimode ONLY where NA in dst_multimode column
+#data2['dst_multimode'] = data2['dst_multimode'].fillna(dm_om_multimode)
+data2['dst_multimode'] = data2['dst_multimode'].fillna(dm_om_multimode)
+
+# data2['dst_multimode']
+
+# Now get the max from multimode
+data2['dst_noNA'] = data2['dst_multimode'].apply(lambda x: np.nanmax(x))
+print(data2)
+
+# Finally create a revised dst with the per-mutation max of dst_noNA
+data2['dst_mode'] = data2.groupby('mutationinformation')['dst_noNA'].max()
+#==============================================================================
+#%% Recalculating drtype: my data
+#--------------------------------
+# drtype: ALL values:
+# numeric and names in an array
+#--------------------------------
+data2['drtype_all_vals'] = data2['drtype_numeric']
+data2['drtype_all_names'] = data2['drtype']
+
+# example: https://stackoverflow.com/questions/55125680/pandas-get-all-groupby-values-in-an-array
+# print(df.groupby('key').data.apply(list).reset_index()) # my use case, don't need the reset_index()
+data2['drtype_all_vals'] = data2.groupby('mutationinformation').drtype_all_vals.apply(list)
+data2['drtype_all_names'] = data2.groupby('mutationinformation').drtype_all_names.apply(list)
+
+#---------------------------------
+# Revised drtype: max(Multimode)
+#--------------------------------
+data2['drtype_multimode'] = data2.groupby(['mutationinformation'])['drtype_numeric'].agg(multimode)
+data2['drtype_multimode']
+
+# Now get the max from multimode
+data2['drtype_mode'] = data2['drtype_multimode'].apply(lambda x: np.nanmax(x))
+data2.head()
+
+#----------------------
+# Revised drtype: Max
+#----------------------
+data2.head()
+data2['drtype_max'] = data2.groupby(['mutationinformation'])['drtype_numeric'].max()
+#data2 = data2.reset_index()
+data2.head()
+
+#%% Finally reset index (turns 'mutationinformation' back into a regular column)
+data2 = data2.reset_index()
+#==============================================================================
+#---------------------------------------
+# Create revised mutation_info_column
+#---------------------------------------
+data2['dst_mode'].value_counts()
+data2[drug].value_counts()
+
+# note this is overriding mutation_info_labels, since downstream depends on it
+# make a copy first if you need to keep the original labels
+data2['mutation_info_labels_orig'] = data2['mutation_info_labels']
+data2['mutation_info_labels'] = data2['dst_mode'].map({1: 'DM'
+                                                        , 0: 'OM'})
+data2['mutation_info_labels_orig'].value_counts()
+data2['mutation_info_labels'].value_counts()
+#==============================================================================
+# sanity check: every row's 'mutation' copy must still equal its 'mutationinformation' value, else exit
+if (all(data2['mutation'] == data2['mutationinformation'])):
+    print('\nPass: Mutationinformation check successful')
+else:
+    sys.exit('\nERROR: mutationin cross checks failed. Please check your group_by() aggregate functions')
+
+# Drop mutation column (only needed for the check above)
+data2.drop(['mutation'], axis=1, inplace=True)
+#%% subset: equivalent of merged_df3?
+# https://stackoverflow.com/questions/39900061/sort-lists-in-a-pandas-dataframe-column
+
+# result = data2['dst_multimode'].sort_values().apply(lambda x: sorted(x))
+# newdf = pd.DataFrame({'dst_multimode': Series(list(set(result['a'].apply(tuple))))})
+# newdf.sort_values(by='a')
+
+# data2['dst_multimode'].value_counts()
+# data2.sort_values(['dst_multimode'], ascending=False)
+
+#data_df3 = data2.drop_duplicates(['mutationinformation'])
+#data_df3_v2 = data2.drop_duplicates(['mutationinformation'])
+#all(data_df3 == data_df3_v2)
+#%%
\ No newline at end of file
diff --git a/test_data/sample_data_pivot.ods b/test_data/sample_data_pivot.ods
new file mode 100644
index 0000000..d1e2394
Binary files /dev/null and b/test_data/sample_data_pivot.ods differ
diff --git a/test_data/snippet_res_pnca.py b/test_data/snippet_res_pnca.py
new file mode 100644
index 0000000..e92f7c2
--- /dev/null
+++ b/test_data/snippet_res_pnca.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Apr 21 14:54:26 2022
+
+@author: tanu
+"""
+# RESULTS:
+# Total WT in dr_muts_col: 30761
+# Total matches of pncA SNP matches in dr_mutations_pyrazinamide : 3973
+# Total samples with > 1 pncA nsSNPs in dr_muts_col: 73
+# Total matches of UNIQUE pncA SNP matches in dr_mutations_pyrazinamide : 227
+# =================================================================RESULTS:
+# Total WT in other_muts_col: 31653
+# Total matches of pncA SNP matches in other_mutations_pyrazinamide : 943
+# Total samples with > 1 pncA nsSNPs in other_muts_col: 14
+# Total matches of UNIQUE pncA SNP matches in other_mutations_pyrazinamide : 200
+# =================================================================
+
+# First copy index column
+# then use the Vcounts analogy for counts