horrible lineage analysis hell
This commit is contained in:
parent
ce0f12382e
commit
478df927cc
10 changed files with 1669 additions and 101 deletions
|
@ -60,9 +60,12 @@ import collections
|
|||
#%% dir and local imports
|
||||
homedir = os.path.expanduser('~')
|
||||
# set working dir
|
||||
os.getcwd()
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||
os.getcwd()
|
||||
# os.getcwd()
|
||||
# os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||
# os.getcwd()
|
||||
|
||||
sys.path.append(homedir + '/git/LSHTM_analysis/scripts/')
|
||||
|
||||
#=======================================================================
|
||||
#%% command line args
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
|
@ -1550,6 +1553,29 @@ gene_LF3['dst_multimode'].value_counts()
|
|||
#gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
|
||||
gene_LF3['dst_mode'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x)) #ML
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
#-----------------------------------------------------------------------------
|
||||
# NOTE: the dst_mode computed above gave unexpected results, so it is recomputed below per mutation (groupby + merge)
|
||||
mmdf = pd.DataFrame(gene_LF3.groupby('mutationinformation')['dst_mode'].agg(multimode))
|
||||
mmdf['dst2'] = mmdf['dst_mode'].apply(lambda x: int(max(x)))
|
||||
mmdf=mmdf.reset_index()
|
||||
|
||||
# rename cols to make sure merge will have the names you expect
|
||||
mmdf2 = mmdf.rename(columns = {'dst_mode':'dst_multimode', 'dst2':'dst_mode'})
|
||||
|
||||
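# IMPORTANT: work on a copy and drop the old dst columns first, so the merge below does not clash with the renamed columns in mmdf2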
|
||||
gene_LF3_copy = gene_LF3.copy()
|
||||
gene_LF3_copy.drop(["dst_mode", "dst_multimode", "dst_multimode_all"], axis = 1, inplace = True)
|
||||
|
||||
# Now merge gene_LF3_copy and mmdf2
|
||||
gene_LF3_merged = pd.merge(gene_LF3_copy, mmdf2, on='mutationinformation')
|
||||
df_check4 = gene_LF3_merged[['mutationinformation', 'dst', 'dst_multimode', 'dst_mode', 'position' ]]
|
||||
|
||||
# now reassign the merged df to gene_LF3 for integration with downstream
|
||||
gene_LF3 = gene_LF3_merged.copy()
|
||||
#-----------------------------------------------------------------------------
|
||||
#-----------------------------------------------------------------------------
|
||||
|
||||
# sanity checks
|
||||
#gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode'])
|
||||
gene_LF3[drug].value_counts()
|
||||
|
@ -1700,10 +1726,24 @@ lf_lin_split['lineage_numeric'].value_counts()
|
|||
# Add lineage_list: ALL values:
|
||||
#--------------------------------
|
||||
# Add all lineages for each mutation
|
||||
lf_lin_split['lineage_corrupt_list'] = lf_lin_split['lineage_corrupt']
|
||||
lf_lin_split['lineage_corrupt_list'].value_counts()
|
||||
#lf_lin_split['lineage_corrupt_list'] = lf_lin_split['lineage_corrupt'].copy()
|
||||
#lf_lin_split['lineage_corrupt_list'].value_counts()
|
||||
lf_lin_split['lineage_corrupt'].value_counts()
|
||||
|
||||
#lf_lin_split['lineage_corrupt_list'] = lf_lin_split['mutationinformation'].map(lf_lin_split.groupby('mutationinformati
|
||||
lf_lin_split['lineage_corrupt_list'] = lf_lin_split.groupby('mutationinformation').lineage_corrupt_list.apply(list)
|
||||
#lf_lin_split['lineage_corrupt_list'] = lf_lin_split.groupby('mutationinformation').lineage_corrupt_list.apply(list)
|
||||
lf_lin_tmp =lf_lin_split.groupby('Mut').lineage_corrupt.apply(list)
|
||||
lf_lin_tmp = lf_lin_tmp.reset_index()
|
||||
lf_lin_tmp.rename(columns={'lineage_corrupt': 'lineage_corrupt_list' }, inplace=True)
|
||||
#lf_lin_split['lineage_corrupt_list'] = lf_lin_split.groupby('Mut').lineage_corrupt_list.apply(list).copy()
|
||||
|
||||
#lf_lin_split['lineage_corrupt_list'] = lf_lin_tmp
|
||||
lf_lin_merged = pd.merge(lf_lin_split, lf_lin_tmp, on='Mut')
|
||||
lf_lin_split.shape
|
||||
lf_lin_merged.shape
|
||||
|
||||
# REASSIGN merged
|
||||
lf_lin_split = lf_lin_merged.copy()
|
||||
lf_lin_split['lineage_corrupt_list'].value_counts()
|
||||
|
||||
#--------------------------------
|
||||
|
@ -1727,10 +1767,18 @@ lf_lin_split['lineage_ulist'] = lf_lin_split['lineage_set'].apply(lambda x : li
|
|||
#-------------------------------------
|
||||
# Lineage numeric mode: multimode
|
||||
#-------------------------------------
|
||||
lf_lin_split['lineage_multimode'] = lf_lin_split.groupby('mutationinformation')['lineage_numeric'].agg(multimode)
|
||||
lf_lin_split['lineage_multimode'].value_counts()
|
||||
#lf_lin_split['lineage_multimode'] = lf_lin_split.groupby('mutationinformation')['lineage_numeric'].agg(multimode)
|
||||
#lf_lin_split['lineage_multimode'] = lf_lin_split.groupby('Mut')['lineage_numeric'].agg(multimode)
|
||||
|
||||
# can't take max as it doesn't mean anything!
|
||||
lin_mm_tmp = pd.DataFrame(lf_lin_split.groupby('Mut')['lineage_numeric'].agg(multimode))
|
||||
lin_mm_tmp=lin_mm_tmp.reset_index()
|
||||
lin_mm_tmp.rename(columns={'lineage_numeric':'lineage_multimode'}, inplace=True)
|
||||
|
||||
lf_lin_split_merged = pd.merge(lf_lin_split, lin_mm_tmp, on='Mut')
|
||||
#lf_lin_split['lineage_multimode'].value_counts() # can't take max as it doesn't mean anything!
|
||||
|
||||
#REASSIGN
|
||||
lf_lin_split = lf_lin_split_merged.copy()
|
||||
|
||||
###############################################################################
|
||||
#%% Select only the columns you want to merge from lf_lin_split
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue