handled rpob 5uhc position offset in mcsm_ppi2

This commit is contained in:
Tanushree Tunstall 2022-01-04 10:45:29 +00:00
parent 46e2c93885
commit 00b84ccb1c
30 changed files with 395 additions and 63 deletions

View file

@ -24,7 +24,7 @@ from reference_dict import up_3letter_aa_dict
from reference_dict import oneletter_aa_dict
#%%============================================================================
def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name):
"""
@param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all mcsm snps
which is the result of combining all mcsm_ppi2 batch results, and using
@ -78,30 +78,57 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
# # check
# mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
# mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
#%%============================================================================
#############
# rename cols
#############
# format colnames: all lowercase and consistent colnames
mcsm_ppi2_data.columns
print('Assigning meaningful colnames'
, '\n=======================================================')
my_colnames_dict = {'chain': 'chain'
, 'wild-type': 'wt_upper'
, 'res-number': 'position'
, 'mutant': 'mut_upper'
, 'distance-to-interface': 'interface_dist'
, 'mcsm-ppi2-prediction': 'mcsm_ppi2_affinity'
, 'affinity': 'mcsm_ppi2_outcome'
, 'w_type': 'wild_type' # one letter amino acid code
, 'm_type': 'mutant_type' # one letter amino acid code
}
# mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
#%%=====================================================================
# Add an offset-corrected position number for rpob, since the 5uhc structure
# (chain 'C') was used to run the analysis
geneL_sp = ['rpob']
if gene_name.lower() in geneL_sp:
offset = 6
chain_orig = 'A'
# Add the offset-corrected position number and the corresponding chain id,
# both matching the rpob nsSNPs used for mCSM-lig
mcsm_ppi2_data['position'] = mcsm_ppi2_data['res-number'] - offset
mcsm_ppi2_data['chain'] = chain_orig
mcsm_ppi2_data['5uhc_offset'] = offset
#############
# rename cols
#############
# format colnames: all lowercase and consistent colnames
mcsm_ppi2_data.columns
print('Assigning meaningful colnames'
, '\n=======================================================')
my_colnames_dict = {'chain' : 'chain'
, 'position' : 'position'
, '5uhc_offset' : '5uhc_offset'
, 'wild-type' : 'wt_upper'
, 'res-number' : '5uhc_position'
, 'mutant' : 'mut_upper'
, 'distance-to-interface': 'interface_dist'
, 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
, 'affinity' : 'mcsm_ppi2_outcome'
, 'w_type' : 'wild_type' # one letter amino acid code
, 'm_type' : 'mutant_type' # one letter amino acid code
}
else:
my_colnames_dict = {'chain' : 'chain'
, 'wild-type' : 'wt_upper'
, 'res-number' : 'position'
, 'mutant' : 'mut_upper'
, 'distance-to-interface': 'interface_dist'
, 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
, 'affinity' : 'mcsm_ppi2_outcome'
, 'w_type' : 'wild_type' # one letter amino acid code
, 'm_type' : 'mutant_type' # one letter amino acid code
}
#%%==============================================================================
mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
mcsm_ppi2_data.columns
#############
# create mutationinformation column
#############
@ -137,22 +164,47 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
, '\nExpected number:', mcsm_ppi2_pos
, '\nGot:', mcsm_ppi2_pos2
, '\n======================================================')
#%%=====================================================================
#############
###################
# reorder columns
#############
###################
mcsm_ppi2_data.columns
mcsm_ppi2_dataf = mcsm_ppi2_data[['mutationinformation'
, 'mcsm_ppi2_affinity'
, 'mcsm_ppi2_scaled'
, 'mcsm_ppi2_outcome'
, 'interface_dist'
, 'wild_type'
, 'position'
, 'mutant_type'
, 'wt_upper'
, 'mut_upper'
, 'chain']]
#---------------------
# Determine col order
#---------------------
core_cols = ['mutationinformation'
, 'mcsm_ppi2_affinity'
, 'mcsm_ppi2_scaled'
, 'mcsm_ppi2_outcome'
, 'interface_dist'
, 'wild_type'
, 'position'
, 'mutant_type'
, 'wt_upper'
, 'mut_upper'
, 'chain']
if gene_name.lower() in geneL_sp:
column_order = core_cols + ['5uhc_offset', '5uhc_position']
else:
column_order = core_cols.copy()
#--------------
# reorder now
#--------------
mcsm_ppi2_dataf = mcsm_ppi2_data[column_order]
#%%============================================================================
###################
# Sort df by position
# and mutant_type
###################
mcsm_ppi2_dataf.sort_values(by = ['position', 'mutant_type'], inplace = True, ascending = True)
return(mcsm_ppi2_dataf)
#%%#####################################################################
#%%#####################################################################

View file

@ -67,7 +67,7 @@ outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv'
# Data: gid+streptomycin
#==========================
print('Formatting results for:', infile_mcsm_ppi2)
mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2)
mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2, gene_name = gene)
# writing file
print('Writing formatted df to csv')