horrible lineage analysis hell

This commit is contained in:
Tanushree Tunstall 2022-06-28 21:51:02 +01:00
parent ce0f12382e
commit 478df927cc
10 changed files with 1669 additions and 101 deletions

View file

@ -60,9 +60,12 @@ import collections
#%% dir and local imports
# Resolve the current user's home directory; the repository paths below are
# built by string concatenation onto it.
homedir = os.path.expanduser('~')
# set working dir
# NOTE(review): the three active statements below are followed by a
# commented-out copy of the same three statements. This looks like both sides
# of a change shown together -- confirm whether the chdir is meant to be live.
# The bare os.getcwd() calls discard their return value, so they have no
# effect when the file is run as a script.
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
# os.getcwd()
# os.chdir(homedir + '/git/LSHTM_analysis/scripts')
# os.getcwd()
# Put the scripts directory on the module search path so imports can be
# resolved from it regardless of the current working directory.
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/')
#=======================================================================
#%% command line args
# Create the command-line parser (its arguments are registered outside this view).
arg_parser = argparse.ArgumentParser()
@ -1550,6 +1553,29 @@ gene_LF3['dst_multimode'].value_counts()
#gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
# Collapse each 'dst_multimode' entry to a single value: np.nanmax returns the
# largest element while ignoring NaN.
gene_LF3['dst_mode'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x)) #ML
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
# NOTE: unexpected weirdness with above, so redoing it!
# Recompute per mutation: group rows by 'mutationinformation' and aggregate the
# 'dst_mode' values of each group with multimode. The result is indexed by
# 'mutationinformation' and has a single column, 'dst_mode'.
# (multimode is presumably statistics.multimode -- its import is not visible here.)
mmdf = pd.DataFrame(gene_LF3.groupby('mutationinformation')['dst_mode'].agg(multimode))
# Reduce each group's multimode result to its largest member, stored as int.
# NOTE(review): int() raises ValueError on NaN -- confirm 'dst_mode' cannot be
# NaN at this point.
mmdf['dst2'] = mmdf['dst_mode'].apply(lambda x: int(max(x)))
# Turn the group key back into an ordinary column so it can be used as the
# merge key below.
mmdf=mmdf.reset_index()
# rename cols to make sure merge will have the names you expect
mmdf2 = mmdf.rename(columns = {'dst_mode':'dst_multimode', 'dst2':'dst_mode'})
# IMPORTANT!
# Work on a copy and drop the previously computed columns first; otherwise the
# merge would see the same column names on both sides and add _x/_y suffixes.
gene_LF3_copy = gene_LF3.copy()
gene_LF3_copy.drop(["dst_mode", "dst_multimode", "dst_multimode_all"], axis = 1, inplace = True)
# Now merge gene_LF3.copy and mmdf2
# pd.merge defaults to an inner join on the given key.
# NOTE(review): groupby drops NaN keys by default, so rows whose
# 'mutationinformation' is NaN would be lost here -- confirm there are none.
gene_LF3_merged = pd.merge(gene_LF3_copy, mmdf2, on='mutationinformation')
# Column subset of the merged frame; presumably kept for manual inspection
# (it is not used again within this view).
df_check4 = gene_LF3_merged[['mutationinformation', 'dst', 'dst_multimode', 'dst_mode', 'position' ]]
# now reassign the merged df to gene_LF3 for integration with downstream
gene_LF3 = gene_LF3_merged.copy()
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
# sanity checks
#gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode'])
# The result of this expression is discarded; it only shows output when run
# interactively.
gene_LF3[drug].value_counts()
@ -1700,10 +1726,24 @@ lf_lin_split['lineage_numeric'].value_counts()
# Add lineage_list: ALL values:
#--------------------------------
# Add all lineages for each mutation
# NOTE(review): the next two statements, and the active
# groupby('mutationinformation') assignment further down, look like the
# pre-change side of a diff shown next to its replacement. If they run as
# displayed, 'lineage_corrupt_list' already exists in lf_lin_split when the
# merge happens, pandas suffixes the clashing columns (_x/_y), and the final
# lookup of 'lineage_corrupt_list' fails with KeyError. Confirm which lines
# are live in the real file.
lf_lin_split['lineage_corrupt_list'] = lf_lin_split['lineage_corrupt']
lf_lin_split['lineage_corrupt_list'].value_counts()
#lf_lin_split['lineage_corrupt_list'] = lf_lin_split['lineage_corrupt'].copy()
#lf_lin_split['lineage_corrupt_list'].value_counts()
lf_lin_split['lineage_corrupt'].value_counts()
#lf_lin_split['lineage_corrupt_list'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformati
# The groupby result is indexed by 'mutationinformation', whereas column
# assignment aligns on lf_lin_split's row index.
# NOTE(review): unless those indexes match, this fills the column with NaN.
lf_lin_split['lineage_corrupt_list'] = lf_lin_split.groupby('mutationinformation').lineage_corrupt_list.apply(list)
#lf_lin_split['lineage_corrupt_list'] = lf_lin_split.groupby('mutationinformation').lineage_corrupt_list.apply(list)
# Build a lookup table instead: one row per 'Mut', holding the list of all
# 'lineage_corrupt' values seen for that mutation.
lf_lin_tmp =lf_lin_split.groupby('Mut').lineage_corrupt.apply(list)
# Make 'Mut' a regular column so it can serve as the merge key.
lf_lin_tmp = lf_lin_tmp.reset_index()
lf_lin_tmp.rename(columns={'lineage_corrupt': 'lineage_corrupt_list' }, inplace=True)
#lf_lin_split['lineage_corrupt_list'] = lf_lin_split.groupby('Mut').lineage_corrupt_list.apply(list).copy()
#lf_lin_split['lineage_corrupt_list'] = lf_lin_tmp
# Attach the per-mutation list to every row that shares that 'Mut'
# (pd.merge defaults to an inner join).
lf_lin_merged = pd.merge(lf_lin_split, lf_lin_tmp, on='Mut')
# Shapes before and after the merge; the results are discarded, so these only
# show output when run interactively.
lf_lin_split.shape
lf_lin_merged.shape
# REASSIGN merged
lf_lin_split = lf_lin_merged.copy()
lf_lin_split['lineage_corrupt_list'].value_counts()
#--------------------------------
@ -1727,10 +1767,18 @@ lf_lin_split['lineage_ulist'] = lf_lin_split['lineage_set'].apply(lambda x : li
#-------------------------------------
# Lineage numeric mode: multimode
#-------------------------------------
# NOTE(review): the next two active statements look like the pre-change side
# of a diff shown next to its replacement. The groupby result is indexed by
# 'mutationinformation', not by lf_lin_split's row index, so the assignment
# aligns on mismatched labels. It also creates 'lineage_multimode' before the
# merge below, which would make pandas suffix the clashing columns (_x/_y).
# Confirm which lines are live in the real file.
lf_lin_split['lineage_multimode'] = lf_lin_split.groupby('mutationinformation')['lineage_numeric'].agg(multimode)
lf_lin_split['lineage_multimode'].value_counts()
#lf_lin_split['lineage_multimode'] = lf_lin_split.groupby('mutationinformation')['lineage_numeric'].agg(multimode)
#lf_lin_split['lineage_multimode'] = lf_lin_split.groupby('Mut')['lineage_numeric'].agg(multimode)
# can't take max as it doesn't mean anything!
# Build a lookup table: one row per 'Mut' with the multimode of its
# 'lineage_numeric' values (multimode is presumably statistics.multimode --
# its import is not visible here).
lin_mm_tmp = pd.DataFrame(lf_lin_split.groupby('Mut')['lineage_numeric'].agg(multimode))
# Make 'Mut' a regular column so it can serve as the merge key.
lin_mm_tmp=lin_mm_tmp.reset_index()
lin_mm_tmp.rename(columns={'lineage_numeric':'lineage_multimode'}, inplace=True)
# Attach the per-mutation multimode to every row that shares that 'Mut'
# (pd.merge defaults to an inner join).
lf_lin_split_merged = pd.merge(lf_lin_split, lin_mm_tmp, on='Mut')
#lf_lin_split['lineage_multimode'].value_counts() # can't take max as it doesn't mean anything!
#REASSIGN
lf_lin_split = lf_lin_split_merged.copy()
###############################################################################
#%% Select only the columns you want to merge from lf_lin_split