added lineage and af count accounting for corrupt data

This commit is contained in:
Tanushree Tunstall 2022-04-08 17:00:57 +01:00
parent 28d0d68413
commit 409caaf0bc
2 changed files with 77 additions and 40 deletions

View file

@ -11,19 +11,66 @@ import numpy as np
from statistics import mean, median, mode
from statistics import multimode
from collections import Counter
from tidy_split import tidy_split
#import math
# https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function
# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
# round up
#int(math.ceil(mean(foo)))
#https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
# int(math.ceil(mean(foo)))
# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
# https://stackoverflow.com/questions/37189878/pandas-add-column-to-groupby-dataframe
# https://stackoverflow.com/questions/43847520/how-to-get-the-distinct-count-of-values-in-a-python-pandas-dataframe
#%% Read data and formatting
# Drug under study; not referenced again in the visible part of this script.
drug = "pyrazinamide"
# Load the sample data set.
# NOTE(review): the path is absolute and machine-specific — confirm it exists
# before running this on another machine.
data = pd.read_csv("/home/tanu/git/ML_AI_training/test_data/sample_data.csv")
# Bare expressions: they only show output when the cell is run interactively.
data.columns
data.head()
#%% Quick checks: Lineage and sample count for each mutation
# Number of distinct ids and of distinct mutations in the data.
data['id'].nunique()
data['mutationinformation'].nunique()
# Keep the distinct-id total; the AF cell further down divides by it.
total_id_ucount = data['id'].nunique()
total_id_ucount
# Number of rows per mutation (size() counts rows, whatever the selected
# column contains).
data.groupby('mutationinformation')['lineage'].size()
data.groupby('mutationinformation')['lineage_corrupt'].size()
data.groupby('mutationinformation')['id'].size()
# How often each lineage value occurs within each mutation.
data.groupby('mutationinformation')['lineage'].value_counts()
# Number of distinct lineage values per mutation.
data.groupby('mutationinformation')['lineage'].nunique()
#%% id count: add all ids and count of unique ids per mutation
# Each row receives the list of every id recorded for its mutation.
data['id_list'] = data['mutationinformation'].map(
    data.groupby('mutationinformation')['id'].apply(list)
)
# Each row receives the number of distinct ids recorded for its mutation.
data['id_ucount'] = data['mutationinformation'].map(
    data.groupby('mutationinformation')['id'].nunique()
)
# Show the new columns next to the columns they were derived from.
data[['mutationinformation', 'id', 'id_list', 'id_ucount']]
#%% Lineages: add all lineages and count of unique lineages per mutation
# Clean case: the 'lineage' column holds one lineage value per row.
data['lineage']
# Each row receives the list of every lineage recorded for its mutation.
data['lineage_list'] = data['mutationinformation'].map(
    data.groupby('mutationinformation')['lineage'].apply(list)
)
# Each row receives the number of distinct lineages recorded for its mutation.
data['lineage_ucount'] = data['mutationinformation'].map(
    data.groupby('mutationinformation')['lineage'].nunique()
)
data[['mutationinformation', 'lineage', 'lineage_list', 'lineage_ucount']]

# Corrupt case: a 'lineage_corrupt' entry may hold several lineages joined by ';'.
data['lineage_corrupt']
# Split on ';' with tidy_split().
# NOTE(review): presumably this yields one row per ';'-separated entry —
# verify against the tidy_split module.
data_split = tidy_split(data, 'lineage_corrupt', sep=';')
# Strip surrounding whitespace; otherwise padded values would be counted as
# separate lineages.
data_split['lineage_corrupt'] = data_split['lineage_corrupt'].str.strip()
data_split.head()
# Same per-mutation list and distinct count, now on the split lineage values.
data_split['lineage_corrupt_list'] = data_split['mutationinformation'].map(
    data_split.groupby('mutationinformation')['lineage_corrupt'].apply(list)
)
data_split['lineage_corrupt_ucount'] = data_split['mutationinformation'].map(
    data_split.groupby('mutationinformation')['lineage_corrupt'].nunique()
)
data_split[['mutationinformation', 'lineage_corrupt_list', 'lineage_corrupt_ucount']]
# Compare the distinct-lineage counts from the clean and the corrupt column.
data_split[['mutationinformation', 'lineage_ucount', 'lineage_corrupt_ucount']]
#%% AF: calculate AF for each mutation
#1) calculate no. of unique ids
# Per-row ratio: distinct ids carrying the row's mutation divided by the
# distinct ids in the whole data set.
# NOTE(review): the result is only displayed, not stored in a column or a
# variable — confirm whether it should be assigned.
data['id_ucount']/total_id_ucount
#%% DM OM labels
# COPY mutation_info_labels column
data['mutation_info_labels_orig'] = data['mutation_info_labels']
@ -172,18 +219,6 @@ else:
# Drop mutation column
data2.drop(['mutation'], axis=1, inplace=True)
#%% Process lineage info
# Count how many distinct lineages each mutation is observed in.
# https://stackoverflow.com/questions/37189878/pandas-add-column-to-groupby-dataframe
# https://stackoverflow.com/questions/43847520/how-to-get-the-distinct-count-of-values-in-a-python-pandas-dataframe
# Rows per mutation, taken once via 'lineage' and once via 'sample'.
data2.groupby('mutationinformation')['lineage'].size()
data2.groupby('mutationinformation')['sample'].size()
# How often each lineage value occurs within each mutation.
data2.groupby('mutationinformation')['lineage'].value_counts()
# Number of distinct lineage values per mutation.
data2.groupby('mutationinformation')['lineage'].nunique()
# Attach that distinct-lineage count to every row of its mutation.
data2['lin_count'] = data2['mutationinformation'].map(
    data2.groupby('mutationinformation')['lineage'].nunique()
)
#%% subset: equivalent of merged_df3?
# https://stackoverflow.com/questions/39900061/sort-lists-in-a-pandas-dataframe-column

View file

@ -1,26 +1,28 @@
sample,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype,lineage
S1,M1A,1,0,DM,MDR,l1
S2,M1A,1,1,DM,Pre-MDR,l2
S3,M1A,1,1,OM,Sensitive,l1
S4,M1A,1,NA,OM,Other,l3
S5,M1A,1,1,OM,Pre-XDR,l2
S6,M1A,1,1,DM,XDR,l4
S7,M1B,1,NA,OM,MDR,l1
S8,M1B,1,1,DM,Other,l1
S9,M1B,1,NA,DM,Other,l2
S10,M1B,1,0,OM,Sensitive,l2
S11,M1C,1,NA,OM,Pre-XDR,l3
S12,M1C,1,NA,OM,Pre-XDR,l1
S13,M1C,1,1,OM,MDR,l1
S14,M1C,1,NA,DM,MDR,l2
S15,A2B,2,0,OM,Other,l4
S16,A2B,2,0,OM,XDR,l4
S17,A2C,2,NA,DM,Pre-MDR,l5
S18,A2C,2,1,DM,Pre-MDR,l1
S19,D3E,3,1,DM,XDR,l2
S20,D3E,3,NA,DM,MDR,l2
S21,D3E,3,NA,OM,Pre-MDR,l1
S22,D3P,3,0,OM,Pre-MDR,l2
S23,D3A,3,0,OM,Sensitive,l5
S24,P4A,4,NA,OM,Other,l6
S25,P5A,5,1,DM,Sensitive,l4
id,old,mutationinformation,position,pyrazinamide,mutation_info_labels,drtype,lineage_corrupt,lineage
S1,S1,M1A,1,0,DM,MDR,l1; l3; l4 ,l1
S2,S2,M1A,1,1,DM,Pre-MDR,l2,l2
S3,S3,M1A,1,1,OM,Sensitive,l1,l1
S4,S4,M1A,1,NA,OM,Other,l3,l3
S5,S5,M1A,1,1,OM,Pre-XDR,l2,l2
S6,S6,M1A,1,1,DM,XDR,l4,l3
S1,S7,M1B,1,NA,OM,MDR,l1,l1
S7,S8,M1B,1,1,DM,Other,l1,l1
S8,S9,M1B,1,NA,DM,Other,l2,l2
S2,S10,M1B,1,0,OM,Sensitive,l2,l2
S3,S11,M1C,1,NA,OM,Pre-XDR,l3,l3
S4,S12,M1C,1,NA,OM,Pre-XDR,l1,l1
S8,S13,M1C,1,1,OM,MDR,l1,l1
S8,S14,M1C,1,NA,DM,MDR,l2,l2
S4,S15,A2B,2,0,OM,Other,l4,l4
S3,S16,A2B,2,0,OM,XDR,l4,l4
S2,S17,A2C,2,NA,DM,Pre-MDR,l5,l5
S1,S18,A2C,2,1,DM,Pre-MDR,l1,l1
S7,S19,D3E,3,1,DM,XDR,l2,l2
S8,S20,D3E,3,NA,DM,MDR,l2,l2
S8,S21,D3E,3,NA,OM,Pre-MDR,l1,l1
S5,S22,D3P,3,0,OM,Pre-MDR,l2,l2
S6,S23,D3A,3,0,OM,Sensitive,l5,l5
S7,S24,P4A,4,NA,OM,Other,l6,l6
S8,S25,P5A,5,1,DM,Sensitive,l4,l4
S8,S26,Q6L,6,1,DM,Others,l2,l2
S4,S27,Q6L,6,NA,OM,MDR,l5; l2,l5

1 id sample old mutationinformation position pyrazinamide mutation_info_labels drtype lineage_corrupt lineage
2 S1 S1 M1A 1 0 DM MDR l1; l3; l4 l1
3 S2 S2 M1A 1 1 DM Pre-MDR l2 l2
4 S3 S3 M1A 1 1 OM Sensitive l1 l1
5 S4 S4 M1A 1 NA OM Other l3 l3
6 S5 S5 M1A 1 1 OM Pre-XDR l2 l2
7 S6 S6 M1A 1 1 DM XDR l4 l4 l3
8 S1 S7 M1B 1 NA OM MDR l1 l1
9 S7 S8 M1B 1 1 DM Other l1 l1
10 S8 S9 M1B 1 NA DM Other l2 l2
11 S2 S10 M1B 1 0 OM Sensitive l2 l2
12 S3 S11 M1C 1 NA OM Pre-XDR l3 l3
13 S4 S12 M1C 1 NA OM Pre-XDR l1 l1
14 S8 S13 M1C 1 1 OM MDR l1 l1
15 S8 S14 M1C 1 NA DM MDR l2 l2
16 S4 S15 A2B 2 0 OM Other l4 l4
17 S3 S16 A2B 2 0 OM XDR l4 l4
18 S2 S17 A2C 2 NA DM Pre-MDR l5 l5
19 S1 S18 A2C 2 1 DM Pre-MDR l1 l1
20 S7 S19 D3E 3 1 DM XDR l2 l2
21 S8 S20 D3E 3 NA DM MDR l2 l2
22 S8 S21 D3E 3 NA OM Pre-MDR l1 l1
23 S5 S22 D3P 3 0 OM Pre-MDR l2 l2
24 S6 S23 D3A 3 0 OM Sensitive l5 l5
25 S7 S24 P4A 4 NA OM Other l6 l6
26 S8 S25 P5A 5 1 DM Sensitive l4 l4
27 S8 S26 Q6L 6 1 DM Others l2 l2
28 S4 S27 Q6L 6 NA OM MDR l5; l2 l5