added lineage and af count accounting for corrupt data

This commit is contained in:
Tanushree Tunstall 2022-04-08 17:00:57 +01:00
parent 28d0d68413
commit 409caaf0bc
2 changed files with 77 additions and 40 deletions

View file

@ -11,19 +11,66 @@ import numpy as np
from statistics import mean, median, mode from statistics import mean, median, mode
from statistics import multimode from statistics import multimode
from collections import Counter from collections import Counter
from tidy_split import tidy_split
#import math #import math
# https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function # https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function
# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean # https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
# round up # round up
#int(math.ceil(mean(foo))) # int(math.ceil(mean(foo)))
#https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean # https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
# https://stackoverflow.com/questions/37189878/pandas-add-column-to-groupby-dataframe
# https://stackoverflow.com/questions/43847520/how-to-get-the-distinct-count-of-values-in-a-python-pandas-dataframe
#%% Read data and formatting #%% Read data and formatting
drug = "pyrazinamide" drug = "pyrazinamide"
data = pd.read_csv("/home/tanu/git/ML_AI_training/test_data/sample_data.csv") data = pd.read_csv("/home/tanu/git/ML_AI_training/test_data/sample_data.csv")
data.columns data.columns
data.head()
#%% Quick checks: Lineage and sample count for each mutation
data['id'].nunique()
data['mutationinformation'].nunique()
total_id_ucount = data['id'].nunique()
total_id_ucount
data.groupby('mutationinformation')['lineage'].size()
data.groupby('mutationinformation')['lineage_corrupt'].size()
data.groupby('mutationinformation')['id'].size()
data.groupby('mutationinformation')['lineage'].value_counts()
data.groupby('mutationinformation')['lineage'].nunique()
#%% id count: add the list of all ids and the count of unique ids per mutation
data['id_list'] = data['mutationinformation'].map(data.groupby('mutationinformation')['id'].apply(list))
data['id_ucount'] = data['mutationinformation'].map(data.groupby('mutationinformation')['id'].nunique())
data[['mutationinformation', 'id', 'id_list', 'id_ucount']]
#%% Lineages: add all lineages and count of unique lineages per mutation
# Lineages good: lineage column holds a single lineage per row (a mutationinformation can still span several rows, and hence several lineages)
data['lineage']
data['lineage_list'] = data['mutationinformation'].map(data.groupby('mutationinformation')['lineage'].apply(list))
data['lineage_ucount'] = data['mutationinformation'].map(data.groupby('mutationinformation')['lineage'].nunique())
data[['mutationinformation', 'lineage', 'lineage_list', 'lineage_ucount']]
# Lineage corrupt: lineage_corrupt column can hold multiple lineages in a single row, separated by ';'
data['lineage_corrupt']
# split using tidy_split()
data_split = tidy_split(data, 'lineage_corrupt', sep = ';')
# strip leading and trailing white space, else padded values are counted as distinct lineages as well
#data_split['lineage_corrupt'] = data_split['lineage_corrupt'].str.lstrip()
data_split['lineage_corrupt'] = data_split['lineage_corrupt'].str.strip()
data_split.head()
data_split['lineage_corrupt_list'] = data_split['mutationinformation'].map(data_split.groupby('mutationinformation')['lineage_corrupt'].apply(list))
data_split['lineage_corrupt_ucount'] = data_split['mutationinformation'].map(data_split.groupby('mutationinformation')['lineage_corrupt'].nunique())
data_split[['mutationinformation', 'lineage_corrupt_list', 'lineage_corrupt_ucount']]
data_split[['mutationinformation', 'lineage_ucount', 'lineage_corrupt_ucount']]
#%% AF: calculate AF for each mutation
#1) AF per mutation = no. of unique ids for that mutation / total no. of unique ids
data['id_ucount']/total_id_ucount
#%% DM OM labels
# COPY mutation_info_labels column # COPY mutation_info_labels column
data['mutation_info_labels_orig'] = data['mutation_info_labels'] data['mutation_info_labels_orig'] = data['mutation_info_labels']
@ -172,18 +219,6 @@ else:
# Drop mutation column # Drop mutation column
data2.drop(['mutation'], axis=1, inplace=True) data2.drop(['mutation'], axis=1, inplace=True)
#%% Process lineage info
# add how many different lineages a sample is represented in?
# https://stackoverflow.com/questions/37189878/pandas-add-column-to-groupby-dataframe
# https://stackoverflow.com/questions/43847520/how-to-get-the-distinct-count-of-values-in-a-python-pandas-dataframe
data2.groupby('mutationinformation')['lineage'].size() # sample count
data2.groupby('mutationinformation')['sample'].size()
data2.groupby('mutationinformation')['lineage'].value_counts()
data2.groupby('mutationinformation')['lineage'].nunique()
data2['lin_count'] = data2['mutationinformation'].map(data2.groupby('mutationinformation')['lineage'].nunique())
#%% subset: equivalent of merged_df3? #%% subset: equivalent of merged_df3?
# https://stackoverflow.com/questions/39900061/sort-lists-in-a-pandas-dataframe-column # https://stackoverflow.com/questions/39900061/sort-lists-in-a-pandas-dataframe-column

View file

@ -1,26 +1,28 @@
sample,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype,lineage id,old,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype,lineage_corrupt,lineage
S1,M1A,1,0,DM,MDR,l1 S1,S1,M1A,1,0,DM,MDR,l1; l3; l4 ,l1
S2,M1A,1,1,DM,Pre-MDR,l2 S2,S2,M1A,1,1,DM,Pre-MDR,l2,l2
S3,M1A,1,1,OM,Sensitive,l1 S3,S3,M1A,1,1,OM,Sensitive,l1,l1
S4,M1A,1,NA,OM,Other,l3 S4,S4,M1A,1,NA,OM,Other,l3,l3
S5,M1A,1,1,OM,Pre-XDR,l2 S5,S5,M1A,1,1,OM,Pre-XDR,l2,l2
S6,M1A,1,1,DM,XDR,l4 S6,S6,M1A,1,1,DM,XDR,l4,l3
S7,M1B,1,NA,OM,MDR,l1 S1,S7,M1B,1,NA,OM,MDR,l1,l1
S8,M1B,1,1,DM,Other,l1 S7,S8,M1B,1,1,DM,Other,l1,l1
S9,M1B,1,NA,DM,Other,l2 S8,S9,M1B,1,NA,DM,Other,l2,l2
S10,M1B,1,0,OM,Sensitive,l2 S2,S10,M1B,1,0,OM,Sensitive,l2,l2
S11,M1C,1,NA,OM,Pre-XDR,l3 S3,S11,M1C,1,NA,OM,Pre-XDR,l3,l3
S12,M1C,1,NA,OM,Pre-XDR,l1 S4,S12,M1C,1,NA,OM,Pre-XDR,l1,l1
S13,M1C,1,1,OM,MDR,l1 S8,S13,M1C,1,1,OM,MDR,l1,l1
S14,M1C,1,NA,DM,MDR,l2 S8,S14,M1C,1,NA,DM,MDR,l2,l2
S15,A2B,2,0,OM,Other,l4 S4,S15,A2B,2,0,OM,Other,l4,l4
S16,A2B,2,0,OM,XDR,l4 S3,S16,A2B,2,0,OM,XDR,l4,l4
S17,A2C,2,NA,DM,Pre-MDR,l5 S2,S17,A2C,2,NA,DM,Pre-MDR,l5,l5
S18,A2C,2,1,DM,Pre-MDR,l1 S1,S18,A2C,2,1,DM,Pre-MDR,l1,l1
S19,D3E,3,1,DM,XDR,l2 S7,S19,D3E,3,1,DM,XDR,l2,l2
S20,D3E,3,NA,DM,MDR,l2 S8,S20,D3E,3,NA,DM,MDR,l2,l2
S21,D3E,3,NA,OM,Pre-MDR,l1 S8,S21,D3E,3,NA,OM,Pre-MDR,l1,l1
S22,D3P,3,0,OM,Pre-MDR,l2 S5,S22,D3P,3,0,OM,Pre-MDR,l2,l2
S23,D3A,3,0,OM,Sensitive,l5 S6,S23,D3A,3,0,OM,Sensitive,l5,l5
S24,P4A,4,NA,OM,Other,l6 S7,S24,P4A,4,NA,OM,Other,l6,l6
S25,P5A,5,1,DM,Sensitive,l4 S8,S25,P5A,5,1,DM,Sensitive,l4,l4
S8,S26,Q6L,6,1,DM,Others,l2,l2
S4,S27,Q6L,6,NA,OM,MDR,l5; l2,l5

1 id sample old mutationinformation position pyrazinamide mutation_info_labels drtype lineage_corrupt lineage
2 S1 S1 M1A 1 0 DM MDR l1; l3; l4 l1
3 S2 S2 M1A 1 1 DM Pre-MDR l2 l2
4 S3 S3 M1A 1 1 OM Sensitive l1 l1
5 S4 S4 M1A 1 NA OM Other l3 l3
6 S5 S5 M1A 1 1 OM Pre-XDR l2 l2
7 S6 S6 M1A 1 1 DM XDR l4 l4 l3
8 S1 S7 M1B 1 NA OM MDR l1 l1
9 S7 S8 M1B 1 1 DM Other l1 l1
10 S8 S9 M1B 1 NA DM Other l2 l2
11 S2 S10 M1B 1 0 OM Sensitive l2 l2
12 S3 S11 M1C 1 NA OM Pre-XDR l3 l3
13 S4 S12 M1C 1 NA OM Pre-XDR l1 l1
14 S8 S13 M1C 1 1 OM MDR l1 l1
15 S8 S14 M1C 1 NA DM MDR l2 l2
16 S4 S15 A2B 2 0 OM Other l4 l4
17 S3 S16 A2B 2 0 OM XDR l4 l4
18 S2 S17 A2C 2 NA DM Pre-MDR l5 l5
19 S1 S18 A2C 2 1 DM Pre-MDR l1 l1
20 S7 S19 D3E 3 1 DM XDR l2 l2
21 S8 S20 D3E 3 NA DM MDR l2 l2
22 S8 S21 D3E 3 NA OM Pre-MDR l1 l1
23 S5 S22 D3P 3 0 OM Pre-MDR l2 l2
24 S6 S23 D3A 3 0 OM Sensitive l5 l5
25 S7 S24 P4A 4 NA OM Other l6 l6
26 S8 S25 P5A 5 1 DM Sensitive l4 l4
27 S8 S26 Q6L 6 1 DM Others l2 l2
28 S4 S27 Q6L 6 NA OM MDR l5; l2 l5