handled rpob 5uhc position offset in mcsm_ppi2

This commit is contained in:
Tanushree Tunstall 2022-01-04 10:45:29 +00:00
parent 46e2c93885
commit 00b84ccb1c
30 changed files with 395 additions and 63 deletions

View file

@ -53,7 +53,7 @@ homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
# FIXME: local imports
@ -170,6 +170,18 @@ infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
# Formatted SNP list: the file has no header row, so the single column is
# labelled 'mutationinformation' on read.
infile_mcsm_f_snps = '{}{}'.format(outdir, infilename_mcsm_f_snps)
mcsm_f_snps = pd.read_csv(
    infile_mcsm_f_snps,
    sep=',',
    header=None,
    names=['mutationinformation'],
)

# more output added

## consurf [change colnames]
# Read from the 'consurf/' sub-directory of outdir.
infilename_consurf = '{}_consurf_grades_f.csv'.format(gene.lower())
infile_consurf = '{}consurf/{}'.format(outdir, infilename_consurf)
consurf_df = pd.read_csv(infile_consurf, sep=',')

## SNAP2 [add normalised score]
# Read from the 'snap2/' sub-directory of outdir.
infilename_snap2 = '{}_snap2_output.csv'.format(gene.lower())
infile_snap2 = '{}snap2/{}'.format(outdir, infilename_snap2)
snap2_df = pd.read_csv(infile_snap2, sep=',')
#------------------------------------------------------------------------------
# ONLY: for genes gid and rpob: End logic should pick this up!
geneL_na = ['gid', 'rpob']
@ -196,7 +208,7 @@ if gene.lower() in geneL_dy:
# infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
# mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
# ONLY:for gene embb and alr: End logic should pick this up!
# ONLY:for gene embb and alr and katg: End logic should pick this up!
geneL_ppi2 = ['embb', 'alr']
#if gene.lower() == "embb" or "alr":
if gene.lower() in geneL_ppi2:
@ -381,6 +393,247 @@ else:
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
print('\nPASS: Deepddg data is scaled between -1 and 1',
'\nproceeding with merge')
#=======================
# Consurf
#=======================
# Format the raw ConSurf table (consurf_df) ready for merging:
#   1. drop the junk first row
#   2. rename columns, adding a gene-specific offset to the position if needed
#   3. parse the colour / confidence-interval strings into numeric columns
#   4. rescale the score so negatives stay negative and positives positive
#   5. select (and order) the output columns into consurf_df_f

# drop row 0: as it contains no value but hangover text
consurf_df = consurf_df.drop(index=0)

#----------------------
# rename columns
#----------------------
print('\nRenaming cols and assigning pretty column names')

# Genes whose ConSurf numbering is shifted relative to 'position';
# the value is the offset added to the ConSurf position.
consurf_offsets = {'alr': 34, 'katg': 23, 'rpob': 28}
geneL_consurf = ['alr', 'katg', 'rpob']

if gene.lower() in geneL_consurf:
    # keep the original ConSurf numbering alongside the offset position
    consurf_df = consurf_df.rename(columns={'POS': 'position_consurf'})

    #---------------------------
    # Specify the offset
    #---------------------------
    print('\nAdding offset value for gene:', gene.lower())
    offset_val = consurf_offsets[gene.lower()]
    print('\nUsing offset val:', offset_val)

    consurf_df['position'] = consurf_df['position_consurf'] + offset_val
else:
    consurf_df = consurf_df.rename(columns={'POS': 'position'})

consurf_df = consurf_df.rename(columns={
    'SEQ': 'wild_type',
    '3LATOM': 'wt_3upper',
    'SCORE': 'consurf_score',
    'COLOR': 'consurf_colour_str',
    'CONFIDENCEINTERVAL': 'consurf_ci',
    'CONFIDENCEINTERVALCOLORS': 'consurf_ci_colour',
    'MSADATA': 'consurf_msa_data',
    'RESIDUEVARIETY': 'consurf_aa_variety',
})

# quick check: report only, the script carries on either way
if len(consurf_df) == len(rd_df):
    print('\nPASS: length of consurf df is as expected'
          , '\nProceeding to format consurf df')
else:
    print('\nFAIL: length mismatch'
          , '\nExpected nrows:', len(rd_df)
          , '\nGot:', len(consurf_df))

#----------------------
# parse string columns
#----------------------
consurf_df['consurf_score'] = consurf_df['consurf_score'].astype(float)

# consurf_colour: first digit of the colour string
consurf_df['consurf_colour'] = (
    consurf_df['consurf_colour_str']
    .str.extract(r'(\d).*', expand=False)
    .astype(int)
)

# consurf_colour_rev: any '<char>*' in the colour string is replaced by '0'.
# regex=True is passed explicitly: the pattern is a regular expression and
# pandas >= 2.0 treats patterns as literal text by default.
consurf_df['consurf_colour_rev'] = (
    consurf_df['consurf_colour_str']
    .str.replace(r'.\*', '0', regex=True)
    .astype(int)
)

# Confidence interval is stored as '<a>:<b>'.
# NOTE(review): the part before ':' is stored as 'upper' and the part after
# as 'lower' -- confirm this matches the ordering in the ConSurf output.
consurf_df['consurf_ci_upper'] = (
    consurf_df['consurf_ci'].str.extract(r'(.*):', expand=False).astype(float)
)
consurf_df['consurf_ci_lower'] = (
    consurf_df['consurf_ci'].str.extract(r':(.*)', expand=False).astype(float)
)

# The chain is whatever follows ':' in wt_3upper. It must be extracted BEFORE
# the '<digits>:<chain>' suffix is stripped below, otherwise there is no ':'
# left to match and the chain column ends up entirely NaN.
consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)', expand=False)
consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '', regex=True)

#-------------------------
# scale consurf values
#-------------------------
# Rescale values in consurf_score col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
consurf_min = consurf_df['consurf_score'].min()
consurf_max = consurf_df['consurf_score'].max()


def consurf_scale(x):
    """Scale one score: negatives by |min|, non-negatives by max.

    Returns the string 'failed' when x is neither < 0 nor >= 0 (e.g. NaN).
    """
    if x < 0:
        return x/abs(consurf_min)
    if x >= 0:
        return x/consurf_max
    return 'failed'


consurf_df['consurf_scaled'] = consurf_df['consurf_score'].apply(consurf_scale)
print('\nRaw consurf scores:\n', consurf_df['consurf_score']
      , '\n---------------------------------------------------------------'
      , '\nScaled consurf scores:\n', consurf_df['consurf_scaled'])

# additional check added: the number of non-negative values must be unchanged
# by scaling, and the scaled range must be exactly [-1, 1]
csmi = consurf_df['consurf_scaled'].min()
csma = consurf_df['consurf_scaled'].max()

consurf_pos = (consurf_df['consurf_score'] >= 0).sum()
consurf_pos2 = (consurf_df['consurf_scaled'] >= 0).sum()

if consurf_pos == consurf_pos2 and csmi == -1 and csma == 1:
    print('\nPASS: Consurf values scaled correctly b/w -1 and 1')
else:
    print('\nFAIL: Consurf values scaled numbers MISmatch'
          , '\nExpected number:', consurf_pos
          , '\nGot:', consurf_pos2
          , '\n======================================================')

#---------------------------
# select columns
# (and also determine order)
#---------------------------
consurf_df_f = consurf_df[['position'
                           , 'wild_type'
                           , 'chain'
                           , 'wt_3upper'
                           , 'consurf_score'
                           , 'consurf_scaled'
                           , 'consurf_colour'
                           , 'consurf_colour_rev'
                           , 'consurf_ci_upper'
                           , 'consurf_ci_lower'
                           , 'consurf_ci_colour'
                           , 'consurf_msa_data'
                           , 'consurf_aa_variety']]
#=======================
# SNAP2
#=======================
# Format the raw SNAP2 table (snap2_df) ready for merging:
#   1. rename columns (genes in geneL_snap2 already carry a
#      'mutationinformation' column, so SNAP2's own 'Variant' is kept
#      separately as 'mutationinformation_snap2')
#   2. turn the accuracy percentage string into an int
#   3. rescale the score so negatives stay negative and positives positive
#   4. select (and order) the output columns into snap2_df_f

#----------------------
# rename columns
#----------------------
geneL_snap2 = ['alr', 'katg', 'rpob']
snap2_has_offset_col = gene.lower() in geneL_snap2

# renames shared by both cases
snap2_colnames = {'Predicted Effect': 'snap2_outcome'
                  , 'Score': 'snap2_score'
                  , 'Expected Accuracy': 'snap2_accuracy_pc'}

if snap2_has_offset_col:
    print('\nReading SNAP2 for gene:', gene.lower()
          , '\nOffset column also being read'
          , '\nRenaming columns...'
          , '\nColumn mutationinformation exists. Renaming SNAP2 column variant --> mutationinformation')
    # existing 'mutationinformation' column is left untouched
    snap2_colnames['Variant'] = 'mutationinformation_snap2'
else:
    print('\nReading SNAP2 for gene:', gene.lower()
          , '\nNo offset column for SNAP2'
          , '\nRenaming columns...'
          , '\nRenaming SNAP2 column variant --> mutationinformation')
    snap2_colnames['Variant'] = 'mutationinformation'

snap2_df = snap2_df.rename(columns=snap2_colnames)

# accuracy is read as text with a trailing '%': strip it and convert
snap2_df['snap2_accuracy_pc'] = (
    snap2_df['snap2_accuracy_pc']
    .str.replace('%', '', regex=False)
    .astype(int)
)

#-------------------------
# scale snap2 values
#-------------------------
# Rescale values in snap2_score col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
snap2_min = snap2_df['snap2_score'].min()
snap2_max = snap2_df['snap2_score'].max()


def snap2_scale(x):
    """Scale one score: negatives by |min|, non-negatives by max.

    Returns the string 'failed' when x is neither < 0 nor >= 0 (e.g. NaN).
    """
    if x < 0:
        return x/abs(snap2_min)
    if x >= 0:
        return x/snap2_max
    return 'failed'


snap2_df['snap2_scaled'] = snap2_df['snap2_score'].apply(snap2_scale)
print('\nRaw snap2 scores:\n', snap2_df['snap2_score']
      , '\n---------------------------------------------------------------'
      , '\nScaled snap2 scores:\n', snap2_df['snap2_scaled'])

# additional check added: the number of non-negative values must be unchanged
# by scaling, and the scaled range must be exactly [-1, 1]
ssmi = snap2_df['snap2_scaled'].min()
ssma = snap2_df['snap2_scaled'].max()

snap2_pos = (snap2_df['snap2_score'] >= 0).sum()
snap2_pos2 = (snap2_df['snap2_scaled'] >= 0).sum()

# Check the SNAP2 range (ssmi/ssma); the consurf min/max were being tested
# here, so the SNAP2 range was never actually verified.
if snap2_pos == snap2_pos2 and ssmi == -1 and ssma == 1:
    print('\nPASS: Snap2 values scaled correctly b/w -1 and 1')
else:
    print('\nFAIL: snap2 values scaled numbers MISmatch'
          , '\nExpected number:', snap2_pos
          , '\nGot:', snap2_pos2
          , '\n======================================================')

#---------------------------
# select columns
# (and also determine order)
#---------------------------
snap2_cols = ['mutationinformation']
if snap2_has_offset_col:
    # keep SNAP2's own variant label next to the merge key
    snap2_cols.append('mutationinformation_snap2')
snap2_cols.extend(['snap2_score'
                   , 'snap2_scaled'
                   , 'snap2_accuracy_pc'
                   , 'snap2_outcome'])

print('\nSelecting cols SNAP2 for gene:', gene.lower())
snap2_df_f = snap2_df[snap2_cols]
#%%============================================================================
# Now merges begin
@ -499,7 +752,9 @@ merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
dssp_kd_dfs = pd.merge(dssp_df
, kd_df
, on = merging_cols_m2
, how = "outer")
#, how = "outer")
, how = "inner")
print('\n\nResult of third merge:', dssp_kd_dfs.shape
, '\n===================================================================')
@ -521,6 +776,26 @@ print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape
, '\n===================================================================')
dssp_kd_rd_dfs[merging_cols_m3].apply(len)
dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
#%%============================================================================
# Fourth merge*: add the consurf data to the dssp + kd + rd merge result.
# Rows are kept from both sides (outer join) on whatever columns the two
# dataframes have in common.
# '\n' replaces the invalid escape '\d' that was in the third banner line.
print('==================================='
      , '\nFourth merge*: fourth merge + consurf_df'
      , '\ndssp_kd_rd_dfs + consurf_df'
      , '\n===================================')

# NOTE(review): this merges the full consurf_df, not the column-selected
# consurf_df_f built in the Consurf section -- confirm which is intended.
merging_cols_m3_v2 = detect_common_cols(dssp_kd_rd_dfs, consurf_df)
dssp_kd_rd_con_dfs = pd.merge(dssp_kd_rd_dfs
                              , consurf_df
                              , on = merging_cols_m3_v2
                              , how = "outer")

ncols_m3_v2 = len(dssp_kd_rd_con_dfs.columns)

print('\n\nResult of fourth merge*:', dssp_kd_rd_con_dfs.shape
      , '\n===================================================================')
#%%============================================================================
print('======================================='
, '\nFifth merge: Second merge + fourth merge'