more tidying and formatting for combining_dfs.py. Hopefully no more after today

This commit is contained in:
Tanushree Tunstall 2022-01-11 17:51:28 +00:00
parent c48fa1dbb0
commit 1f266c4cb8
2 changed files with 217 additions and 84 deletions

combining_dfs.py

@@ -227,26 +227,97 @@ outfile_stab_struc = outdir + out_filename_stab_struc
# end of variable assignment for input and output files
#%%############################################################################
print('\n==================================================='
, '\nCombining dfs for gene:', gene.lower()
, '\nDrug name:', drug
, '\n====================================================')
#=====================
# some preprocessing
# variable setting
#=====================
gene_pdb_f = gene.lower() + '_complex.pdb'
gene_with_offset = ['alr', 'katg', 'rpob']
#geneL_snap2 = ['alr', 'katg', 'rpob']
gene_with_offset_missing_aa = ['embb']
if gene.lower() == 'alr':
offset_val = 34
if gene.lower() == 'katg':
offset_val = 23
if gene.lower() == 'rpob':
offset_val = 28
if gene.lower() == 'embb':
offset_val = 23
missing_aa_start = 248
missing_aa_end = 269
missing_res_offset = missing_aa_end - missing_aa_start
if gene.lower() == 'gid':
offset_val = 0
if gene.lower() == 'pnca':
offset_val = 0
#===========
# DSSP
#===========
print('\nFormatting dssp_df:'
, '\nSelecting chain:', sel_chain, 'for gene', gene.lower())
dssp_df = dssp_df_raw[dssp_df_raw['chain_id'] == sel_chain]
print('\nDim of dssp_df:', dssp_df.shape)
dssp_df['chain_id'].value_counts()
#===========
# KD
#===========
kd_df.shape
if gene.lower() in gene_with_offset:
print ('\nRunning analysis for gene with offset:', gene.lower()
, '\nOffset value:', offset_val
, '\nFormatting kd_df with offset value'
, kd_df.shape)
kd_df['position'] = kd_df['position'] + offset_val
# geneL_kd = ['alr']
# if gene.lower() in geneL_kd:
# print('\nRunning gene:', gene.lower()
# ,'\nChecking start numbering')
if gene.lower() in gene_with_offset_missing_aa:
print ('\nRunning analysis for gene with offset and missing aa:', gene.lower()
, '\nOffset value:', offset_val
, '\nMissing residues gap:', missing_res_offset
, '\nFormatting kd_df with offset value'
, '\nDim of kd_df:', kd_df.shape)
if kd_df['wild_type_kd'].str.contains('X').any():
print('\nDetected X in wild_type_kd'
, '\nRunning gene:', gene.lower()
, '\nChecking start numbering')
kd_df = kd_df[~kd_df['wild_type_kd'].str.contains('X')]
# First correct offset, then correct GAP
print('\nFirst correcting offset')
kd_df['position'] = kd_df['position'] + offset_val
print('\nNow correcting gap')
kd_df.loc[kd_df['position'] >= 248, 'position'] = kd_df['position'] + missing_res_offset
else:
print ('\nRunning analysis for gene WITHOUT offset:', gene.lower()
, '\nNo offset value'
, '\nUsing kd_df as read'
, kd_df.shape)
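A minimal sketch of the two-step renumbering above (offset first, then missing-residue gap) on a toy kd-style df; the numbers here are illustrative, not embB's real values:

import pandas as pd

toy_kd = pd.DataFrame({'wild_type_kd': ['M', 'K', 'X', 'L'],
                       'position': [1, 2, 3, 4]})
offset_val = 23                                # illustrative offset
missing_aa_start, missing_aa_end = 248, 269    # illustrative gap bounds
missing_res_offset = missing_aa_end - missing_aa_start

# drop unresolved residues marked 'X', as in the embB branch
toy_kd = toy_kd[~toy_kd['wild_type_kd'].str.contains('X')]
# step 1: shift every position by the structure offset
toy_kd['position'] = toy_kd['position'] + offset_val
# step 2: positions at or beyond the gap also absorb the gap length
toy_kd.loc[toy_kd['position'] >= missing_aa_start, 'position'] += missing_res_offset
print(toy_kd)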
print('\nChecking the position numbers from the 3 dfs: dssp, kd and rd')
dssp_kd_rd_pos_checks = pd.DataFrame ({"dssp_pos": dssp_df['position']
, "kd_pos": kd_df['position']
, "rd_pos": rd_df['position']})
dkr_pos_check = [
dssp_kd_rd_pos_checks['dssp_pos'].equals(dssp_kd_rd_pos_checks['kd_pos'])
, dssp_kd_rd_pos_checks['dssp_pos'].equals(dssp_kd_rd_pos_checks['rd_pos'])
, dssp_kd_rd_pos_checks['kd_pos'].equals(dssp_kd_rd_pos_checks['rd_pos'])
]
if all(dkr_pos_check):
print('\nPASS: position check passed for dssp, rd and kd dfs ')
else:
print('\nFAIL: position check failed for dssp, rd and kd dfs'
, '\nNumbering mismatch. Please check these dfs.')
sys.exit()
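Worth noting: pandas Series.equals is strict about both dtype and index, which is why the positions are collected into a single shared-index df before comparing; a quick sketch of that behaviour:

import pandas as pd

a = pd.Series([1, 2, 3])
b = pd.Series([1, 2, 3], dtype='float64')
c = pd.Series([1, 2, 3], index=[10, 11, 12])

print(a.equals(b))   # False: same values, different dtype
print(a.equals(c))   # False: same values, different index
print(a.equals(c.reset_index(drop=True)))   # True once indices align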
#%%============================================================================
#===========
# FoldX
#===========
@@ -439,26 +510,14 @@ consurf_df.shape
consurf_df.columns
print('\nRenaming cols and assigning pretty column names')
geneL_consurf = ['alr', 'katg', 'rpob']
if gene.lower() in geneL_consurf:
if gene.lower() in gene_with_offset:
consurf_df = consurf_df.rename(columns={'POS' : 'position_consurf'})
#---------------------------
# Specify the offset
#---------------------------
print('\nAdding offset value for gene:', gene.lower())
if gene.lower() == 'alr':
offset_val = 34
print('\nUsing offset val:', offset_val)
if gene.lower() == 'katg':
offset_val = 23
print('\nUsing offset val:', offset_val)
if gene.lower() == 'rpob':
offset_val = 28
print('\nUsing offset val:', offset_val)
print('\nFormatting consurf df'
, '\nAdding offset value for gene:', gene.lower()
, '\nUsing offset val:', offset_val)
consurf_df['position'] = consurf_df['position_consurf'] + offset_val
@@ -586,9 +645,7 @@ snap2_df.shape
#----------------------
# rename colums
#----------------------
geneL_snap2 = ['alr', 'katg', 'rpob']
if gene.lower() in geneL_snap2:
if gene.lower() in gene_with_offset:
print('\nReading SNAP2 for gene:', gene.lower()
, '\nOffset column also being read'
, '\nRenaming columns...'
@@ -665,9 +722,7 @@ else:
snap2_df.dtypes
snap2_df.columns
geneL_snap2 = ['alr', 'katg', 'rpob']
if gene.lower() in geneL_snap2:
if gene.lower() in gene_with_offset :
print('\nSelecting cols SNAP2 for gene:', gene.lower())
snap2_df_f = snap2_df[['mutationinformation'
, 'mutationinformation_snap2'
@@ -685,7 +740,6 @@ else:
#%%============================================================================
# Now merges begin
print('==================================='
, '\nFirst merge: mcsm + foldx'
, '\n===================================')
@@ -715,7 +769,7 @@ print('\n\nResult of first merge:', mcsm_foldx_dfs.shape
mcsm_foldx_dfs[merging_cols_m1].apply(len)
mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs)
#%% for embB and any other targets where mCSM-lig hasn't run for ALL nsSNPs.
#%% for embB: mCSM-lig hasn't run for ALL nsSNPs.
# Get the empty cells to be full of meaningful info
if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():
print ('\nNAs detected in mcsm cols after merge.'
@@ -776,9 +830,8 @@ mcsm_foldx_deepddg_dfs = pd.merge(mcsm_foldx_dfs
, deepddg_df
, on = 'mutationinformation'
, how = "left")
mcsm_foldx_deepddg_dfs['deepddg_outcome'].value_counts()
ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
mcsm_foldx_deepddg_dfs['deepddg_outcome'].value_counts()
mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64')
@@ -792,9 +845,6 @@ dssp_df_raw.shape
kd_df.shape
rd_df.shape
dssp_df = dssp_df_raw[dssp_df_raw['chain_id'] == sel_chain]
dssp_df['chain_id'].value_counts()
#dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = "outer")
merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
dssp_kd_dfs = pd.merge(dssp_df
@@ -826,18 +876,16 @@ dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
#%%============================================================================
print('==================================='
, '\nFourth merge*: fourth merge + consurf_df'
, '\nFifth merge: fourth merge + consurf_df'
, '\ndssp_kd_rd_dfs + consurf_df'
, '\n===================================')
#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = "outer")
merging_cols_m3_v2 = detect_common_cols(dssp_kd_rd_dfs, consurf_df)
dssp_kd_rd_con_dfs = pd.merge(dssp_kd_rd_dfs
, consurf_df
, consurf_df_f
, on = merging_cols_m3_v2
, how = "outer")
ncols_m3_v2 = len(dssp_kd_rd_con_dfs.columns)
print('\n\nResult of fifth merge:', dssp_kd_rd_con_dfs.shape
, '\n===================================================================')
dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len)
@@ -845,8 +893,8 @@ dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len) == len(dssp_kd_rd_con_dfs)
#%%============================================================================
print('======================================='
, '\nFifth merge: Second merge + fourth merge'
, '\nmcsm_foldx_dfs + dssp_kd_rd_dfs'
, '\nSixth merge: Second merge + fifth merge'
, '\nmcsm_foldx_dfs + dssp_kd_rd_con_dfs'
, '\n=======================================')
#combined_df = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = "inner")
@@ -854,56 +902,109 @@ print('======================================='
#combined_df = pd.merge(mcsm_foldx_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how = "inner")
#combined_df_expected_cols = ncols_m1 + ncols_m3 - len(merging_cols_m4)
# with deepddg values
merging_cols_m4 = detect_common_cols(mcsm_foldx_deepddg_dfs, dssp_kd_rd_dfs)
combined_df = pd.merge(mcsm_foldx_deepddg_dfs
, dssp_kd_rd_dfs
# Detect merging cols
merging_cols_m4 = detect_common_cols(mcsm_foldx_deepddg_dfs, dssp_kd_rd_con_dfs)
if len(merging_cols_m4) > 1 and all(x in merging_cols_m4 for x in ['chain', 'position', 'wild_type']):
print('\nmerging_cols_m4 length is > 1 and has mixed dtypes'
, '\nDropping chain and wild_type from dssp_kd_rd_con_dfs')
dssp_kd_rd_con_dfs = dssp_kd_rd_con_dfs.drop(['chain', 'wild_type'], axis = 1)
print('\nDetecting merging cols again')
merging_cols_m4 = detect_common_cols(mcsm_foldx_deepddg_dfs, dssp_kd_rd_con_dfs)
print('\nMerging cols length:', len(merging_cols_m4))
if len(merging_cols_m4) == 1 and 'position' in merging_cols_m4:
print('\nMerging column length == 1. Using it'
, '\nMerging column name:', merging_cols_m4)
else:
sys.exit('\nFAIL: could not merge mcsm_foldx_deepddg and dssp_kd_rd_con_dfs. Please check')
combined_7dfs = pd.merge(mcsm_foldx_deepddg_dfs
, dssp_kd_rd_con_dfs
, on = merging_cols_m4
, how = "inner")
combined_df_expected_cols = ncols_deepddg_merge + ncols_m3 - len(merging_cols_m4)
if len(combined_7dfs.columns) == len(mcsm_foldx_deepddg_dfs.columns) + len(dssp_kd_rd_con_dfs.columns) - len(merging_cols_m4):
print('\nSeven dfs successfully combined:'
, '\nmcsm-lig', '\nfoldx', '\ndeepddg'
, '\ndssp', '\nkd', '\nrd'
, '\nconsurf')
else:
print('\nSomething went wrong with the sixth merge'
, '\nPlease check individual merges')
sys.exit()
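detect_common_cols is the project's own helper (not shown in this diff); a hypothetical equivalent, plus the column-count bookkeeping the check above relies on for an on-key merge:

import pandas as pd

def detect_common_cols_sketch(df1, df2):
    # hypothetical stand-in for the project's detect_common_cols
    return [c for c in df1.columns if c in df2.columns]

left = pd.DataFrame({'position': [1, 2], 'ddg': [0.5, -1.2]})
right = pd.DataFrame({'position': [1, 2], 'asa': [30, 55]})
keys = detect_common_cols_sketch(left, right)    # ['position']
merged = pd.merge(left, right, on=keys, how='inner')

# width of an on-key merge: cols(left) + cols(right) - shared key cols
assert len(merged.columns) == len(left.columns) + len(right.columns) - len(keys)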
#%%============================================================================
print('======================================='
, '\nSeventh merge: Sixth merge + snap2_df_f'
, '\ncombined_7dfs + snap_df_f'
, '\n=======================================')
# Check: whether this logic affects anything else!
# Detect merging cols
merging_cols_m4_v2 = detect_common_cols(combined_7dfs, snap2_df_f)
if len(merging_cols_m4_v2) == 1:
print ('\nProceeding with merging SNAP2 df with 7 already combined dfs')
else:
print('\nHalting Seventh merge. Please check merging cols'
,'\nLength of merging_cols_m4_v2 is:', len(merging_cols_m4_v2))
sys.exit()
combined_df = pd.merge(combined_7dfs
, snap2_df_f
, on = merging_cols_m4_v2
, how = "inner")
combined_df_expected_cols = len(combined_7dfs.columns) + len(snap2_df_f.columns) - len(merging_cols_m4_v2)
#%% Dimension checks for combined_df:
# CHECK whether this logic affects anything else!
if not gene == "embB":
print("\nGene is:", gene)
if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
print('PASS: successfully combined 5 dfs'
print('PASS: successfully combined 8 dfs'
, '\nNo. of rows combined_df:', len(combined_df)
, '\nNo. of cols combined_df:', len(combined_df.columns))
else:
#sys.exit('FAIL: check individual df merges')
print("\nGene is:", gene
, "\ncombined_df length:", len(combined_df)
, "\nmcsm_df_length:", len(mcsm_df)
)
if len(combined_df.columns) == combined_df_expected_cols:
print('PASS: successfully combined 8 dfs'
, '\nNo. of rows combined_df:', len(combined_df)
, '\nNo. of cols combined_df:', len(combined_df.columns))
else:
sys.exit('FAIL: check individual merges')
elif len(combined_df) == len(foldx_df) and len(combined_df.columns) == combined_df_expected_cols:
print('PASS: successfully combined 8 dfs'
, '\nNo. of rows combined_df:', len(combined_df)
, '\nNo. of cols combined_df:', len(combined_df.columns))
else:
sys.exit('FAIL: check individual merges for seventh merge')
print('\nResult of Fourth merge:', combined_df.shape
print('\nResult of Seventh merge:', combined_df.shape
, '\n===================================================================')
combined_df[merging_cols_m4].apply(len)
combined_df[merging_cols_m4].apply(len) == len(combined_df)
# should be TRUE and length match mcsm_df len
combined_df[merging_cols_m4_v2].apply(len)
combined_df[merging_cols_m4_v2].apply(len) == len(combined_df)
#%%============================================================================
# Format the combined df columns
combined_df_colnames = combined_df.columns
# check redundant columns
combined_df['chain'].equals(combined_df['chain_id'])
combined_df['wild_type'].equals(combined_df['wild_type_kd']) # has nan
combined_df['wild_type'].equals(combined_df['wild_type_kd']) # should not have nan
combined_df['wild_type'].equals(combined_df['wild_type_dssp'])
# sanity check
foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower']]
# Drop cols
cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp', 'wt_3letter_caps']
cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp'
, 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower']
print('\nDropping:', len(cols_to_drop), 'cols to get clean_df for further combining'
, '\nncols before dropping:', len(combined_df.columns)
, '\nDropped column names:', cols_to_drop)
combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)
combined_df_clean.columns
print('\nncols after dropping:', len(combined_df_clean.columns))
combined_df_clean_colnames = combined_df_clean.columns
del foo
#%%============================================================================
#---------------------
@@ -966,7 +1067,7 @@ if len(combined_stab_afor) == len(combined_df_clean) and len(combined_stab_afor.
else:
sys.exit('\nFAIL: check individual df merges')
print('\n\nResult of Fifth merge:', combined_stab_afor.shape
print('\n\nResult of Seventh merge:', combined_stab_afor.shape
, '\n===================================================================')
combined_stab_afor[merging_cols_m5].apply(len)
@@ -1029,8 +1130,7 @@ dfs_merged = reduce(lambda left,right: pd.merge(left
, right
, on = ['mutationinformation']
#, how = 'inner')
, how = join_type)
, dfs_list)
, how = join_type) , dfs_list)
# drop excess columns
drop_cols = detect_common_cols(dfs_merged, combined_stab_afor)
drop_cols.remove('mutationinformation')
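A self-contained sketch of the reduce pattern above: pd.merge folded pairwise over a list of dfs that all carry 'mutationinformation'; join_type stands in for the script's configurable join:

from functools import reduce
import pandas as pd

df1 = pd.DataFrame({'mutationinformation': ['A1B', 'C2D'], 'score1': [0.1, 0.9]})
df2 = pd.DataFrame({'mutationinformation': ['A1B', 'C2D'], 'score2': [5, 7]})
df3 = pd.DataFrame({'mutationinformation': ['A1B', 'C2D'], 'score3': [-1, 3]})

join_type = 'inner'
# fold left-to-right: ((df1 merge df2) merge df3), always on mutationinformation
dfs_merged = reduce(lambda left, right: pd.merge(left, right,
                                                 on=['mutationinformation'],
                                                 how=join_type),
                    [df1, df2, df3])
print(dfs_merged.columns.tolist())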
@@ -1038,8 +1138,8 @@ drop_cols.remove('mutationinformation')
dfs_merged_clean = dfs_merged.drop(drop_cols, axis = 1)
merging_cols_m6 = detect_common_cols(dfs_merged_clean, combined_stab_afor)
len(dfs_merged_clean.columns)
len(combined_stab_afor.columns)
len(dfs_merged_clean.columns)
combined_all_params = pd.merge(combined_stab_afor
, dfs_merged_clean
@@ -1064,12 +1164,47 @@ if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
, '\nDropping these before writing file')
extra_cols_to_drop = list(combined_all_params.columns.str.extract(r'(.*_x$|.*_y$)', expand = True).dropna()[0])
print('\nTotal cols:', len(combined_all_params.columns)
,'\nDropping:', len(extra_cols_to_drop), 'columns')
,'\nDropping:', len(extra_cols_to_drop), 'columns'
, '\nThese are:', extra_cols_to_drop)
#extra_cols_to_drop = ['chain_x', 'chain_y']
combined_all_params = combined_all_params.drop(extra_cols_to_drop, axis = 1)
else:
print('\nNo duplicate column names detected, just writing file'
, '\nTotal cols:', len(combined_all_params.columns) )
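A minimal sketch of the _x/_y cleanup above: pandas suffixes duplicated non-key columns on merge, and the regex pass collects them for dropping; the column names here are toy ones:

import pandas as pd

left = pd.DataFrame({'mutationinformation': ['A1B'], 'chain': ['A'], 's1': [1]})
right = pd.DataFrame({'mutationinformation': ['A1B'], 'chain': ['A'], 's2': [2]})

# merging only on mutationinformation duplicates 'chain' as chain_x/chain_y
m = pd.merge(left, right, on='mutationinformation', how='inner')
if m.columns.str.contains(r'_x$|_y$', regex=True).any():
    extra = list(m.columns.str.extract(r'(.*_x$|.*_y$)', expand=True).dropna()[0])
    m = m.drop(extra, axis=1)
print(m.columns.tolist())   # ['mutationinformation', 's1', 's2']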
#%%==============================================================================
# Some final formatting for combined_all_params df
#---------------------------------------
# Add: Map 1 letter
# code to 3Upper for Mutant aa
#---------------------------------------
# initialise a lookup dict mapping
# 1-LETTER aa code to 3-LETTER UPPER aa code
lookup_dict = dict()
for k, v in oneletter_aa_dict.items():
lookup_dict[k] = v['three_letter_code_upper']
# guard restored (editor assumption): map only when 'mutant_type' exists,
# giving the stray else below a matching if
if 'mutant_type' in combined_all_params.columns:
    mut = combined_all_params['mutant_type'].squeeze()
    combined_all_params['mut_3upper'] = mut.map(lookup_dict)
else:
    print('\nFailed to add 3 letter upper code')
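oneletter_aa_dict comes from the project's config and is not shown in this diff; a toy two-entry stand-in showing the same lookup construction and map call:

import pandas as pd

# hypothetical mini version of the project's oneletter_aa_dict
oneletter_aa_dict = {'A': {'three_letter_code_upper': 'ALA'},
                     'R': {'three_letter_code_upper': 'ARG'}}
lookup_dict = {k: v['three_letter_code_upper'] for k, v in oneletter_aa_dict.items()}
print(pd.Series(['A', 'R', 'A']).map(lookup_dict).tolist())   # ['ALA', 'ARG', 'ALA']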
# Add offset value
combined_all_params['seq_offset4pdb'] = offset_val
# Drop extra columns
redundant_colsL = ['mutationinformation_snap2'
, 'wt_upper'
, 'mut_upper'
, 'pdb_file']
combined_all_params_f = combined_all_params.drop(redundant_colsL
, axis = 1
, errors = 'ignore')
# Add pdb_file name at the end
combined_all_params_f['pdb_file'] = gene_pdb_f
#%%============================================================================
#---------------------
# Output 3: write csv
@@ -1078,9 +1213,9 @@ print('\nWriting file: all params')
print('\nOutput 3 filename:', outfile_comb
, '\n===================================================================\n')
combined_all_params.to_csv(outfile_comb, index = False)
combined_all_params_f.to_csv(outfile_comb, index = False)
print('\nFinished writing file:'
, '\nNo. of rows:', combined_all_params.shape[0]
, '\nNo. of cols:', combined_all_params.shape[1])
, '\nNo. of rows:', combined_all_params_f.shape[0]
, '\nNo. of cols:', combined_all_params_f.shape[1])
#%% end of script

View file

@@ -5,8 +5,6 @@
#########################################################
# working dir and loading libraries
getwd()
#setwd("~/git/LSHTM_analysis/scripts/plotting")
source("/home/tanu/git/LSHTM_analysis/scripts/plotting/Header_TT.R")
#********************
@@ -53,7 +51,7 @@ my_df_u = pd_df[[2]] # this forms one of the input for combining_dfs_plotting()
max_ang <- round(max(my_df_u[LigDist_colname]))
min_ang <- round(min(my_df_u[LigDist_colname]))
cat("\nLigand distance cut off, colname:", LigDist_colname
cat("\nLigand distance colname:", LigDist_colname
, "\nThe max distance", gene, "structure df" , ":", max_ang, "\u212b"
, "\nThe min distance", gene, "structure df" , ":", min_ang, "\u212b")