added lineage and af count accounting for corrupt data
This commit is contained in:
parent
28d0d68413
commit
409caaf0bc
2 changed files with 77 additions and 40 deletions
|
@ -11,19 +11,66 @@ import numpy as np
|
|||
from statistics import mean, median, mode
|
||||
from statistics import multimode
|
||||
from collections import Counter
|
||||
from tidy_split import tidy_split
|
||||
|
||||
#import math
|
||||
|
||||
# https://stackoverflow.com/questions/43321455/pandas-count-null-values-in-a-groupby-function
|
||||
# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
|
||||
# round up
|
||||
#int(math.ceil(mean(foo)))
|
||||
#https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
|
||||
# int(math.ceil(mean(foo)))
|
||||
# https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
|
||||
# https://stackoverflow.com/questions/37189878/pandas-add-column-to-groupby-dataframe
|
||||
# https://stackoverflow.com/questions/43847520/how-to-get-the-distinct-count-of-values-in-a-python-pandas-dataframe
|
||||
#%% Read data and formatting
|
||||
drug = "pyrazinamide"
|
||||
|
||||
data = pd.read_csv("/home/tanu/git/ML_AI_training/test_data/sample_data.csv")
|
||||
data.columns
|
||||
|
||||
data.head()
|
||||
#%% Quick checks: Lineage and sample count for each mutation
|
||||
data['id'].nunique()
|
||||
data['mutationinformation'].nunique()
|
||||
total_id_ucount = data['id'].nunique()
|
||||
total_id_ucount
|
||||
|
||||
data.groupby('mutationinformation')['lineage'].size()
|
||||
data.groupby('mutationinformation')['lineage_corrupt'].size()
|
||||
data.groupby('mutationinformation')['id'].size()
|
||||
data.groupby('mutationinformation')['lineage'].value_counts()
|
||||
data.groupby('mutationinformation')['lineage'].nunique()
|
||||
#%% id count: add list of all ids and count of unique ids per mutation
|
||||
data['id_list'] = data['mutationinformation'].map(data.groupby('mutationinformation')['id'].apply(list))
|
||||
data['id_ucount'] = data['mutationinformation'].map(data.groupby('mutationinformation')['id'].nunique())
|
||||
data[['mutationinformation', 'id', 'id_list', 'id_ucount']]
|
||||
#%% Lineages: add all lineages and count of unique lineages per mutation
|
||||
# Lineages good: lineage column has only a single lineage for each mutationinformation
|
||||
data['lineage']
|
||||
data['lineage_list'] = data['mutationinformation'].map(data.groupby('mutationinformation')['lineage'].apply(list))
|
||||
data['lineage_ucount'] = data['mutationinformation'].map(data.groupby('mutationinformation')['lineage'].nunique())
|
||||
data[['mutationinformation', 'lineage', 'lineage_list', 'lineage_ucount']]
|
||||
|
||||
# Lineage corrupt: lineage_corrupt column has multiple lineages for each mutationinformation, separated by ';'
|
||||
data['lineage_corrupt']
|
||||
# split using tidy_split()
|
||||
data_split = tidy_split(data, 'lineage_corrupt', sep = ';')
|
||||
# strip leading and trailing white space, else padded values are counted as distinct lineages as well
|
||||
#data_split['lineage_corrupt'] = data_split['lineage_corrupt'].str.lstrip()
|
||||
data_split['lineage_corrupt'] = data_split['lineage_corrupt'].str.strip()
|
||||
data_split.head()
|
||||
|
||||
data_split['lineage_corrupt_list'] = data_split['mutationinformation'].map(data_split.groupby('mutationinformation')['lineage_corrupt'].apply(list))
|
||||
data_split['lineage_corrupt_ucount'] = data_split['mutationinformation'].map(data_split.groupby('mutationinformation')['lineage_corrupt'].nunique())
|
||||
|
||||
data_split[['mutationinformation', 'lineage_corrupt_list', 'lineage_corrupt_ucount']]
|
||||
data_split[['mutationinformation', 'lineage_ucount', 'lineage_corrupt_ucount']]
|
||||
|
||||
#%% AF: calculate AF for each mutation
|
||||
#1) AF = no. of unique ids per mutation / total no. of unique ids
|
||||
data['id_ucount']/total_id_ucount
|
||||
|
||||
#%% DM OM labels
|
||||
# COPY mutation_info_labels column
|
||||
data['mutation_info_labels_orig'] = data['mutation_info_labels']
|
||||
|
||||
|
@ -172,18 +219,6 @@ else:
|
|||
|
||||
# Drop mutation column
|
||||
data2.drop(['mutation'], axis=1, inplace=True)
|
||||
|
||||
#%% Process lineage info
|
||||
# add how many different lineages a mutation is represented in?
|
||||
# https://stackoverflow.com/questions/37189878/pandas-add-column-to-groupby-dataframe
|
||||
# https://stackoverflow.com/questions/43847520/how-to-get-the-distinct-count-of-values-in-a-python-pandas-dataframe
|
||||
data2.groupby('mutationinformation')['lineage'].size() # sample count
|
||||
data2.groupby('mutationinformation')['sample'].size()
|
||||
data2.groupby('mutationinformation')['lineage'].value_counts()
|
||||
|
||||
data2.groupby('mutationinformation')['lineage'].nunique()
|
||||
data2['lin_count'] = data2['mutationinformation'].map(data2.groupby('mutationinformation')['lineage'].nunique())
|
||||
|
||||
#%% subset: equivalent of merged_df3?
|
||||
# https://stackoverflow.com/questions/39900061/sort-lists-in-a-pandas-dataframe-column
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue