handled rpob 5uhc position offset in mcsm_ppi2
This commit is contained in:
parent
46e2c93885
commit
00b84ccb1c
30 changed files with 395 additions and 63 deletions
|
@ -53,7 +53,7 @@ homedir = os.path.expanduser('~')
|
|||
|
||||
# set working dir
# The bare os.getcwd() calls discard their result; they only show the
# directory when the script is stepped through interactively.
|
||||
os.getcwd()
|
||||
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||
# Switch to the scripts directory under the user's home directory
# (presumably so the local imports that follow resolve -- TODO confirm).
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||
os.getcwd()
|
||||
|
||||
# FIXME: local imports
|
||||
|
@ -170,6 +170,18 @@ infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
|
|||
infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
|
||||
mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
|
||||
|
||||
# more output added
# Both inputs are gene-specific CSV files kept in sub-directories of outdir.

## consurf [change colnames]
infilename_consurf = ''.join([gene.lower(), '_consurf_grades_f.csv'])
infile_consurf = ''.join([outdir, 'consurf/', infilename_consurf])
consurf_df = pd.read_csv(infile_consurf, sep=',')

## SNAP2 [add normalised score]
infilename_snap2 = ''.join([gene.lower(), '_snap2_output.csv'])
infile_snap2 = ''.join([outdir, 'snap2/', infilename_snap2])
snap2_df = pd.read_csv(infile_snap2, sep=',')
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# ONLY: for genes gid and rpob: End logic should pick this up!
|
||||
geneL_na = ['gid', 'rpob']
|
||||
|
@ -196,7 +208,7 @@ if gene.lower() in geneL_dy:
|
|||
# infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
|
||||
# mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
|
||||
|
||||
# ONLY:for gene embb and alr: End logic should pick this up!
|
||||
# ONLY:for gene embb and alr and katg: End logic should pick this up!
|
||||
geneL_ppi2 = ['embb', 'alr']
|
||||
#if gene.lower() == "embb" or "alr":
|
||||
if gene.lower() in geneL_ppi2:
|
||||
|
@ -381,6 +393,247 @@ else:
|
|||
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
|
||||
print('\nPASS: Deepddg data is scaled between -1 and 1',
|
||||
'\nproceeding with merge')
|
||||
|
||||
#=======================
# Consurf
#=======================
consurf_df.shape

# Row 0 holds leftover text rather than values, so it is removed.
consurf_df = consurf_df.drop(index=0)

#----------------------
# rename columns
#----------------------
consurf_df.columns
print('\nRenaming cols and assigning pretty column names')

# Genes whose consurf positions are shifted by a fixed amount, mapped to
# the amount that is added.
consurf_offsets = {'alr': 34, 'katg': 23, 'rpob': 28}
geneL_consurf = list(consurf_offsets)

needs_offset = gene.lower() in geneL_consurf

# With an offset the raw POS column is kept as 'position_consurf' and the
# shifted value becomes 'position'; without one POS is 'position' directly.
pos_colname = 'position_consurf' if needs_offset else 'position'
consurf_df = consurf_df.rename(columns={'POS': pos_colname})

if needs_offset:
    #---------------------------
    # Specify the offset
    #---------------------------
    print('\nAdding offset value for gene:', gene.lower())
    offset_val = consurf_offsets[gene.lower()]
    print('\nUsing offset val:', offset_val)
    consurf_df['position'] = consurf_df['position_consurf'] + offset_val

consurf_pretty_names = {
    'SEQ': 'wild_type',
    '3LATOM': 'wt_3upper',
    'SCORE': 'consurf_score',
    'COLOR': 'consurf_colour_str',
    'CONFIDENCEINTERVAL': 'consurf_ci',
    'CONFIDENCEINTERVALCOLORS': 'consurf_ci_colour',
    'MSADATA': 'consurf_msa_data',
    'RESIDUEVARIETY': 'consurf_aa_variety',
}
consurf_df = consurf_df.rename(columns=consurf_pretty_names)

# quick check: consurf_df should have one row per row of rd_df
if len(consurf_df) != len(rd_df):
    print('\nFAIL: length mismatch'
          , '\nExpected nrows:', len(rd_df)
          , '\nGot:', len(consurf_df))
else:
    print('\nPASS: length of consurf df is as expected'
          , '\nProceeding to format consurf df')
|
||||
|
||||
# Turn the raw consurf text columns into typed, analysis-ready columns.
consurf_df.dtypes
consurf_df['consurf_score'] = consurf_df['consurf_score'].astype(float)

# Colour grade: the first digit found in the colour string.
# expand=False returns a Series, not a one-column DataFrame.
consurf_df['consurf_colour'] = (
    consurf_df['consurf_colour_str']
    .str.extract(r'(\d).*', expand=False)
    .astype(int)
)

# "rev" variant: any character immediately followed by '*' becomes '0'.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal by
# default, which would leave the '*' in place and make astype(int) fail.
consurf_df['consurf_colour_rev'] = (
    consurf_df['consurf_colour_str']
    .str.replace(r'.\*', '0', regex=True)
    .astype(int)
)

# Confidence interval is stored as '<a>:<b>'; the part before ':' goes to
# *_upper and the part after to *_lower.
# NOTE(review): confirm that the first value really is the upper bound.
consurf_df['consurf_ci_upper'] = (
    consurf_df['consurf_ci'].str.extract(r'(.*):', expand=False).astype(float)
)
consurf_df['consurf_ci_lower'] = (
    consurf_df['consurf_ci'].str.extract(r':(.*)', expand=False).astype(float)
)

# The chain id must be read BEFORE the residue number and ':<chain>' suffix
# are stripped from wt_3upper; doing it afterwards yields only NaN.
# assumes wt_3upper values look like '<RES><number>:<chain>' -- TODO confirm
consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)', expand=False)

# Keep only the residue name: drop '<number>:<chain>'.
consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(
    r'(\d+:.*)', '', regex=True)
|
||||
|
||||
#-------------------------
# scale consurf values
#-------------------------
# consurf_score is rescaled to lie between -1 and 1 while keeping its sign:
# negative scores are divided by |min|, non-negative scores by max.
consurf_min = consurf_df['consurf_score'].min()
consurf_max = consurf_df['consurf_score'].max()
consurf_min
consurf_max

# quick check
len(consurf_df.loc[consurf_df['consurf_score'] >= 0])
len(consurf_df.loc[consurf_df['consurf_score'] < 0])


def consurf_scale(x):
    """Scale one raw consurf score; return 'failed' if x is neither < 0 nor >= 0."""
    if x < 0:
        return x/abs(consurf_min)
    if x >= 0:
        return x/consurf_max
    return 'failed'


consurf_df['consurf_scaled'] = consurf_df['consurf_score'].apply(consurf_scale)
print('\nRaw consurf scores:\n', consurf_df['consurf_score']
      , '\n---------------------------------------------------------------'
      , '\nScaled consurf scores:\n', consurf_df['consurf_scaled'])

# additional check added:
# scaling must keep the number of non-negative values and span exactly -1..1
csmi = consurf_df['consurf_scaled'].min()
csma = consurf_df['consurf_scaled'].max()

raw_nonneg = consurf_df['consurf_score'] >= 0
c = consurf_df[raw_nonneg].count()
consurf_pos = c.get('consurf_score')

scaled_nonneg = consurf_df['consurf_scaled'] >= 0
c2 = consurf_df[scaled_nonneg].count()
consurf_pos2 = c2.get('consurf_scaled')

consurf_scaled_ok = consurf_pos == consurf_pos2 and csmi == -1 and csma == 1

if not consurf_scaled_ok:
    print('\nFAIL: Consurf values scaled numbers MISmatch'
          , '\nExpected number:', consurf_pos
          , '\nGot:', consurf_pos2
          , '\n======================================================')
else:
    print('\nPASS: Consurf values scaled correctly b/w -1 and 1')
|
||||
|
||||
consurf_df.dtypes
consurf_df.columns

#---------------------------
# select columns
# (and also determine order)
#---------------------------
# The list order is the column order of consurf_df_f.
consurf_keep_cols = [
    'position',
    'wild_type',
    'chain',
    'wt_3upper',
    'consurf_score',
    'consurf_scaled',
    'consurf_colour',
    'consurf_colour_rev',
    'consurf_ci_upper',
    'consurf_ci_lower',
    'consurf_ci_colour',
    'consurf_msa_data',
    'consurf_aa_variety',
]
consurf_df_f = consurf_df[consurf_keep_cols]
|
||||
|
||||
#=======================
# SNAP2
#=======================
snap2_df.shape

#----------------------
# rename columns
#----------------------
geneL_snap2 = ['alr', 'katg', 'rpob']

# Renames that apply to every gene; only the target name of 'Variant'
# differs between the two cases below.
snap2_renames = {
    'Predicted Effect': 'snap2_outcome',
    'Score': 'snap2_score',
    'Expected Accuracy': 'snap2_accuracy_pc',
}

if gene.lower() in geneL_snap2:
    print('\nReading SNAP2 for gene:', gene.lower()
          , '\nOffset column also being read'
          , '\nRenaming columns...'
          , '\nColumn mutationinformation exists. Renaming SNAP2 column variant --> mutationinformation')
    # The file already has a 'mutationinformation' column, so the SNAP2
    # variant column gets its own name.
    snap2_renames['Variant'] = 'mutationinformation_snap2'
else:
    print('\nReading SNAP2 for gene:', gene.lower()
          , '\nNo offset column for SNAP2'
          , '\nRenaming columns...'
          , '\nRenaming SNAP2 column variant --> mutationinformation')
    snap2_renames['Variant'] = 'mutationinformation'

snap2_df = snap2_df.rename(columns=snap2_renames)

snap2_df.columns
snap2_df.head()
snap2_df.dtypes

# Remove the '%' sign, then store the accuracy as an integer.
accuracy_no_pc = snap2_df['snap2_accuracy_pc'].str.replace('%','')
snap2_df['snap2_accuracy_pc'] = accuracy_no_pc.astype(int)
|
||||
|
||||
#-------------------------
# scale snap2 values
#-------------------------
# Rescale values in snap2_score col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
snap2_min = snap2_df['snap2_score'].min()
snap2_max = snap2_df['snap2_score'].max()
snap2_min
snap2_max

# quick check
len(snap2_df.loc[snap2_df['snap2_score'] >= 0])
len(snap2_df.loc[snap2_df['snap2_score'] < 0])


def snap2_scale(x):
    """Scale one raw snap2 score.

    Negative scores are divided by |snap2_min|, non-negative scores by
    snap2_max. Returns 'failed' if x is neither < 0 nor >= 0.
    """
    if x < 0:
        return x/abs(snap2_min)
    if x >= 0:
        return x/snap2_max
    return 'failed'


snap2_df['snap2_scaled'] = snap2_df['snap2_score'].apply(snap2_scale)
print('\nRaw snap2 scores:\n', snap2_df['snap2_score']
      , '\n---------------------------------------------------------------'
      , '\nScaled snap2 scores:\n', snap2_df['snap2_scaled'])

# additional check added
ssmi = snap2_df['snap2_scaled'].min()
ssma = snap2_df['snap2_scaled'].max()

sn = snap2_df[snap2_df['snap2_score']>=0].count()
snap2_pos = sn.get(key = 'snap2_score')

sn2 = snap2_df[snap2_df['snap2_scaled']>=0].count()
snap2_pos2 = sn2.get(key = 'snap2_scaled')

# Validate the SNAP2 range (ssmi/ssma). The check previously used the consurf
# range (csmi/csma), so a badly scaled snap2 column could still report PASS.
if snap2_pos == snap2_pos2 and ssmi == -1 and ssma == 1:
    print('\nPASS: Snap2 values scaled correctly b/w -1 and 1')
else:
    print('\nFAIL: snap2 values scaled numbers MISmatch'
          , '\nExpected number:', snap2_pos
          , '\nGot:', snap2_pos2
          , '\n======================================================')
|
||||
|
||||
#---------------------------
# select columns
# (and also determine order)
#---------------------------
snap2_df.dtypes
snap2_df.columns

geneL_snap2 = ['alr', 'katg', 'rpob']

print('\nSelecting cols SNAP2 for gene:', gene.lower())

# Columns kept for every gene, in output order.
snap2_keep_cols = [
    'mutationinformation',
    'snap2_score',
    'snap2_scaled',
    'snap2_accuracy_pc',
    'snap2_outcome',
]

# Genes in geneL_snap2 also keep the SNAP2 variant column, placed
# directly after 'mutationinformation'.
if gene.lower() in geneL_snap2:
    snap2_keep_cols.insert(1, 'mutationinformation_snap2')

snap2_df_f = snap2_df[snap2_keep_cols]
|
||||
|
||||
#%%============================================================================
|
||||
# Now merges begin
|
||||
|
@ -499,7 +752,9 @@ merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
|
|||
# Merge dssp_df with kd_df on the columns they share (merging_cols_m2).
# Only one `how` argument may close the call: the join is "inner", and the
# earlier "outer" choice is kept as a comment for reference.
dssp_kd_dfs = pd.merge(dssp_df
                       , kd_df
                       , on = merging_cols_m2
                       #, how = "outer")
                       , how = "inner")

print('\n\nResult of third merge:', dssp_kd_dfs.shape
      , '\n===================================================================')
|
||||
|
@ -521,6 +776,26 @@ print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape
|
|||
, '\n===================================================================')
|
||||
# Inspection of the previous merge: length of each merging column, and
# whether it equals the number of rows in the merged frame.
dssp_kd_rd_dfs[merging_cols_m3].apply(len)
dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)

#%%============================================================================
# '\n' replaces the invalid escape '\d', which printed a literal backslash
# and raises a SyntaxWarning on newer Python versions.
print('==================================='
      , '\nFourth merge*: fourth merge + consurf_df'
      , '\ndssp_kd_rd_dfs + consurf_df'
      , '\n===================================')
#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = "outer")

# Outer-join the consurf data onto the running merge using whichever
# columns the two frames have in common.
# NOTE(review): this merges the full consurf_df, not the column-selected
# consurf_df_f -- confirm that the extra raw columns are wanted.
merging_cols_m3_v2 = detect_common_cols(dssp_kd_rd_dfs, consurf_df)
dssp_kd_rd_con_dfs = pd.merge(dssp_kd_rd_dfs
                              , consurf_df
                              , on = merging_cols_m3_v2
                              , how = "outer")

ncols_m3_v2 = len(dssp_kd_rd_con_dfs.columns)

print('\n\nResult of fourth merge*:', dssp_kd_rd_con_dfs.shape
      , '\n===================================================================')
dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len)
dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len) == len(dssp_kd_rd_con_dfs)
|
||||
|
||||
#%%============================================================================
|
||||
print('======================================='
|
||||
, '\nFifth merge: Second merge + fourth merge'
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue