handled rpob 5uhc position offset in mcsm_ppi2

This commit is contained in:
Tanushree Tunstall 2022-01-04 10:45:29 +00:00
parent 46e2c93885
commit 00b84ccb1c
30 changed files with 395 additions and 63 deletions

View file

@ -53,7 +53,7 @@ homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
# FIXME: local imports
@ -170,6 +170,18 @@ infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
# more output added
# Extra per-gene inputs: both files live in sub-directories of outdir and are
# named after the lower-cased gene.
_gene_lc = gene.lower()

## consurf [change colnames]
infilename_consurf = _gene_lc + '_consurf_grades_f.csv'
infile_consurf = outdir + 'consurf/' + infilename_consurf

## SNAP2 [add normalised score]
infilename_snap2 = _gene_lc + '_snap2_output.csv'
infile_snap2 = outdir + 'snap2/' + infilename_snap2

# Read both tables as comma-separated files
consurf_df = pd.read_csv(infile_consurf, sep = ',')
snap2_df = pd.read_csv(infile_snap2, sep = ',')
#------------------------------------------------------------------------------
# ONLY:for gene pnca and gid: End logic should pick this up!
geneL_na = ['gid', 'rpob']
@ -196,7 +208,7 @@ if gene.lower() in geneL_dy:
# infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
# mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
# ONLY:for gene embb and alr: End logic should pick this up!
# ONLY:for gene embb and alr and katg: End logic should pick this up!
geneL_ppi2 = ['embb', 'alr']
#if gene.lower() == "embb" or "alr":
if gene.lower() in geneL_ppi2:
@ -381,6 +393,247 @@ else:
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
print('\nPASS: Deepddg data is scaled between -1 and 1',
'\nproceeding with merge')
#=======================
# Consurf
#=======================
# Formats the ConSurf grades table held in consurf_df:
#   1. drop the leading junk row and rename the raw columns
#   2. derive 'position' (adding a per-gene offset where one is defined)
#   3. parse colour, confidence interval, residue name and chain
#   4. scale consurf_score into [-1, 1] preserving sign
#   5. select/order the columns into consurf_df_f

# drop row 0: as it contains no value but hangover text
consurf_df = consurf_df.drop(index=0)

#----------------------
# rename columns
#----------------------
print('\nRenaming cols and assigning pretty column names')

# Genes for which an offset is added to the ConSurf 'POS' column to obtain
# 'position', and the offset used for each.
consurf_offsets = {'alr': 34, 'katg': 23, 'rpob': 28}
geneL_consurf = list(consurf_offsets)

if gene.lower() in consurf_offsets:
    consurf_df = consurf_df.rename(columns={'POS' : 'position_consurf'})
    #---------------------------
    # Specify the offset
    #---------------------------
    print('\nAdding offset value for gene:', gene.lower())
    offset_val = consurf_offsets[gene.lower()]
    print('\nUsing offset val:', offset_val)
    consurf_df['position'] = consurf_df['position_consurf'] + offset_val
else:
    consurf_df = consurf_df.rename(columns={'POS' : 'position'})

consurf_df = consurf_df.rename(columns={'SEQ' : 'wild_type'
                                        , '3LATOM' : 'wt_3upper'
                                        , 'SCORE' : 'consurf_score'
                                        , 'COLOR' : 'consurf_colour_str'
                                        , 'CONFIDENCEINTERVAL' : 'consurf_ci'
                                        , 'CONFIDENCEINTERVALCOLORS' : 'consurf_ci_colour'
                                        , 'MSADATA' : 'consurf_msa_data'
                                        , 'RESIDUEVARIETY' : 'consurf_aa_variety'})

# quick check (report only: a mismatch does not stop the script)
if len(consurf_df) == len(rd_df):
    print('\nPASS: length of consurf df is as expected'
          , '\nProceeding to format consurf df')
else:
    print('\nFAIL: length mismatch'
          , '\nExpected nrows:', len(rd_df)
          , '\nGot:', len(consurf_df))

#----------------------
# parse/convert columns
#----------------------
consurf_df['consurf_score'] = consurf_df['consurf_score'].astype(float)

# colour: leading digit of the colour string
consurf_df['consurf_colour'] = consurf_df['consurf_colour_str'].str.extract(r'(\d).*', expand=False)
consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int)

# colour_rev: a character followed by '*' is replaced by '0'.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# a literal string by default, which would leave the starred values in place
# and make the int conversion below fail.
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*', '0', regex=True)
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int)

# confidence interval: text before ':' -> upper, text after ':' -> lower
consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):', expand=False)
consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci_upper'].astype(float)

consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci'].str.extract(r':(.*)', expand=False)
consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci_lower'].astype(float)

# chain must be taken from the raw value BEFORE the '<digits>:<rest>' suffix is
# stripped from wt_3upper; extracting it afterwards (as was done previously)
# finds no ':' and yields NaN for every row.
# NOTE(review): assumes the raw field looks like '<residue><number>:<chain>' - confirm.
wt_3upper_raw = consurf_df['wt_3upper']
consurf_df['chain'] = wt_3upper_raw.str.extract(r':(.*)', expand=False)
consurf_df['wt_3upper'] = wt_3upper_raw.str.replace(r'(\d+:.*)', '', regex=True)

#-------------------------
# scale consurf values
#-------------------------
# Rescale values in consurf_score col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
consurf_min = consurf_df['consurf_score'].min()
consurf_max = consurf_df['consurf_score'].max()


def consurf_scale(x):
    """Scale one score: negatives by |min|, non-negatives by max.

    Values that are neither < 0 nor >= 0 (i.e. NaN) return 'failed'.
    """
    if x < 0:
        return x/abs(consurf_min)
    if x >= 0:
        return x/consurf_max
    return 'failed'


consurf_df['consurf_scaled'] = consurf_df['consurf_score'].apply(consurf_scale)
print('\nRaw consurf scores:\n', consurf_df['consurf_score']
      , '\n---------------------------------------------------------------'
      , '\nScaled consurf scores:\n', consurf_df['consurf_scaled'])

# additional check added: scaling must keep the number of non-negative values
# and hit exactly -1 and 1 at the extremes
csmi = consurf_df['consurf_scaled'].min()
csma = consurf_df['consurf_scaled'].max()

consurf_pos = (consurf_df['consurf_score'] >= 0).sum()
consurf_pos2 = (consurf_df['consurf_scaled'] >= 0).sum()

if consurf_pos == consurf_pos2 and csmi == -1 and csma == 1:
    print('\nPASS: Consurf values scaled correctly b/w -1 and 1')
else:
    print('\nFAIL: Consurf values scaled numbers MISmatch'
          , '\nExpected number:', consurf_pos
          , '\nGot:', consurf_pos2
          , '\n======================================================')

#---------------------------
# select columns
# (and also determine order)
#---------------------------
consurf_df_f = consurf_df[['position'
                           , 'wild_type'
                           , 'chain'
                           , 'wt_3upper'
                           , 'consurf_score'
                           , 'consurf_scaled'
                           , 'consurf_colour'
                           , 'consurf_colour_rev'
                           , 'consurf_ci_upper'
                           , 'consurf_ci_lower'
                           , 'consurf_ci_colour'
                           , 'consurf_msa_data'
                           , 'consurf_aa_variety']]
#=======================
# SNAP2
#=======================
# Formats the SNAP2 table held in snap2_df:
#   1. rename the raw columns (the target name for 'Variant' depends on gene)
#   2. convert the accuracy percentage to int
#   3. scale snap2_score into [-1, 1] preserving sign
#   4. select/order the columns into snap2_df_f

#----------------------
# rename columns
#----------------------
# Genes whose SNAP2 file already carries a 'mutationinformation' column; for
# these the SNAP2 'Variant' column is kept under a separate name.
geneL_snap2 = ['alr', 'katg', 'rpob']
snap2_has_offset_col = gene.lower() in geneL_snap2

snap2_colnames = {'Predicted Effect' : 'snap2_outcome'
                  , 'Score' : 'snap2_score'
                  , 'Expected Accuracy': 'snap2_accuracy_pc'}

if snap2_has_offset_col:
    print('\nReading SNAP2 for gene:', gene.lower()
          , '\nOffset column also being read'
          , '\nRenaming columns...'
          , '\nColumn mutationinformation exists. Renaming SNAP2 column variant --> mutationinformation')
    snap2_colnames['Variant'] = 'mutationinformation_snap2'
else:
    print('\nReading SNAP2 for gene:', gene.lower()
          , '\nNo offset column for SNAP2'
          , '\nRenaming columns...'
          , '\nRenaming SNAP2 column variant --> mutationinformation')
    snap2_colnames['Variant'] = 'mutationinformation'

snap2_df = snap2_df.rename(columns = snap2_colnames)

# accuracy arrives as text with a trailing '%': strip it (literal match) and
# convert to int
snap2_df['snap2_accuracy_pc'] = snap2_df['snap2_accuracy_pc'].str.replace('%', '', regex=False)
snap2_df['snap2_accuracy_pc'] = snap2_df['snap2_accuracy_pc'].astype(int)

#-------------------------
# scale snap2 values
#-------------------------
# Rescale values in snap2_score col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
snap2_min = snap2_df['snap2_score'].min()
snap2_max = snap2_df['snap2_score'].max()


def snap2_scale(x):
    """Scale one score: negatives by |min|, non-negatives by max.

    Values that are neither < 0 nor >= 0 (i.e. NaN) return 'failed'.
    """
    if x < 0:
        return x/abs(snap2_min)
    if x >= 0:
        return x/snap2_max
    return 'failed'


snap2_df['snap2_scaled'] = snap2_df['snap2_score'].apply(snap2_scale)
print('\nRaw snap2 scores:\n', snap2_df['snap2_score']
      , '\n---------------------------------------------------------------'
      , '\nScaled snap2 scores:\n', snap2_df['snap2_scaled'])

# additional check added: scaling must keep the number of non-negative values
# and hit exactly -1 and 1 at the extremes
ssmi = snap2_df['snap2_scaled'].min()
ssma = snap2_df['snap2_scaled'].max()

snap2_pos = (snap2_df['snap2_score'] >= 0).sum()
snap2_pos2 = (snap2_df['snap2_scaled'] >= 0).sum()

# Use the SNAP2 extremes here (ssmi/ssma); the check previously tested the
# Consurf extremes (csmi/csma), so SNAP2 scaling was never actually verified.
if snap2_pos == snap2_pos2 and ssmi == -1 and ssma == 1:
    print('\nPASS: Snap2 values scaled correctly b/w -1 and 1')
else:
    print('\nFAIL: snap2 values scaled numbers MISmatch'
          , '\nExpected number:', snap2_pos
          , '\nGot:', snap2_pos2
          , '\n======================================================')

#---------------------------
# select columns
# (and also determine order)
#---------------------------
print('\nSelecting cols SNAP2 for gene:', gene.lower())
if snap2_has_offset_col:
    snap2_df_f = snap2_df[['mutationinformation'
                           , 'mutationinformation_snap2'
                           , 'snap2_score'
                           , 'snap2_scaled'
                           , 'snap2_accuracy_pc'
                           , 'snap2_outcome']]
else:
    snap2_df_f = snap2_df[['mutationinformation'
                           , 'snap2_score'
                           , 'snap2_scaled'
                           , 'snap2_accuracy_pc'
                           , 'snap2_outcome']]
#%%============================================================================
# Now merges begin
@ -499,7 +752,9 @@ merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
dssp_kd_dfs = pd.merge(dssp_df
, kd_df
, on = merging_cols_m2
, how = "outer")
#, how = "outer")
, how = "inner")
print('\n\nResult of third merge:', dssp_kd_dfs.shape
, '\n===================================================================')
@ -521,6 +776,26 @@ print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape
, '\n===================================================================')
dssp_kd_rd_dfs[merging_cols_m3].apply(len)
dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
#%%============================================================================
# Fourth merge*: outer-join the dssp/kd/rd table with the consurf table on
# whatever columns the two frames share.
# The second banner line previously began with '\d' (an invalid escape that
# prints a literal backslash-d); it is a newline like the other lines.
print('==================================='
      , '\nFourth merge*: fourth merge + consurf_df'
      , '\ndssp_kd_rd_dfs + consurf_df'
      , '\n===================================')
#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = "outer")

# NOTE(review): this merges the full consurf_df, not the column-selected
# consurf_df_f built earlier - confirm which one is intended.
merging_cols_m3_v2 = detect_common_cols(dssp_kd_rd_dfs, consurf_df)
dssp_kd_rd_con_dfs = pd.merge(dssp_kd_rd_dfs
                              , consurf_df
                              , on = merging_cols_m3_v2
                              , how = "outer")

ncols_m3_v2 = len(dssp_kd_rd_con_dfs.columns)
print('\n\nResult of fourth merge*:', dssp_kd_rd_con_dfs.shape
      , '\n===================================================================')

# Bare expressions: their values are only visible when run interactively.
dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len)
dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len) == len(dssp_kd_rd_con_dfs)
#%%============================================================================
print('======================================='
, '\nFifth merge: Second merge + fourth merge'

View file

@ -75,15 +75,14 @@ args = arg_parser.parse_args()
# Drug and gene names as supplied on the command line.
drug = args.drug
gene = args.gene
#drug = 'pyrazinamide'
#gene = 'pncA'
# Prefix of a protein-level mutation string for this gene: '<gene>_p.'
# NOTE(review): this prefix is embedded in the regexes below without escaping,
# so the '.' matches any character (and any regex metacharacter in the gene
# name would be interpreted) - confirm this is acceptable or use re.escape.
gene_match = gene + '_p.'
print('mut pattern for gene', gene, ':', gene_match)
# nsSNP pattern: prefix, three letters, one or more digits, three letters
nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
print('nsSNP for gene', gene, ':', nssnp_match)
# Compiled form of the nsSNP pattern
nssnp_match2 = re.compile(nssnp_match)
# Lower-cased prefix followed by a capture group holding the next three letters
wt_regex = gene_match.lower()+'([A-Za-z]{3})'
print('wt regex:', wt_regex)
@ -219,20 +218,21 @@ meta_gene_epi = meta_gene_multi.loc[(meta_gene_multi['dr_mult_snp_count']>1) | (
#%% TEST
# formatting, replace !nssnp_match with nothing
foo1 = 'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'
#foo1 = 'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
#foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'
foo1_s = foo1.split(';')
foo1_s
nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
arse=list(filter(nssnp_match2.match, foo1_s))
arse
#foo1_s = foo1.split(';')
#foo1_s
#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
#arse=list(filter(nssnp_match2.match, foo1_s))
#arse
#foo1_s2 = ';'.join(arse)
#foo1_s2
foo1_s2 = ';'.join(arse)
foo1_s2
#%%
nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
# dr_muts_col
dr_clean_col = dr_muts_col + '_clean'
@ -248,6 +248,7 @@ for i, v in enumerate(meta_gene_epi[dr_muts_col]):
dr2_s = v.split(';')
print(dr2_s)
dr2_sf = list(filter(nssnp_match2.match, dr2_s))
#dr2_sf = list(filter(nssnp_match.match, dr2_s))
print(dr2_sf)
dr2_sf2 = ';'.join(dr2_sf)
meta_gene_epi[dr_clean_col].iloc[i] = dr2_sf2
@ -262,13 +263,13 @@ meta_gene_epi[other_clean_col] = ''
for i, v in enumerate(meta_gene_epi[other_muts_col]):
#print(i, v)
print('======================================================')
print(i)
print(v)
#print('======================================================')
#print(i)
#print(v)
other2_s = v.split(';')
print(other2_s)
#print(other2_s)
other2_sf = list(filter(nssnp_match2.match, other2_s))
print(other2_sf)
#print(other2_sf)
other2_sf2 = ';'.join(other2_sf)
meta_gene_epi[other_clean_col].iloc[i] = other2_sf2
@ -281,7 +282,8 @@ meta_gene_epi_f = meta_gene_epi[['id', 'sample'
, 'dr_mult_snp_count'
, other_muts_col, other_clean_col
, 'other_mult_snp_count']]
meta_gene_epi_f.columns
#print(meta_gene_epi_f.columns)
print(meta_gene_epi_f)
cols_to_output = ['id', 'sample'
, dr_clean_col
@ -293,7 +295,6 @@ cols_to_output = ['id', 'sample'
meta_gene_epi_f2 = meta_gene_epi_f[cols_to_output]
#%%
# formatting, replace !nssnp_match with nothing
#nssnp_neg_match = '(?!pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})'

View file

@ -92,7 +92,7 @@ else:
infile_fasta = indir + '/' + in_filename_fasta
print('Input fasta file:', infile_fasta
, '\n============================================================')
#=======
# output
#=======

0
scripts/plotting/basic_barplots_combined.R Executable file → Normal file
View file

0
scripts/plotting/corr_adjusted_PS_LIG.R Executable file → Normal file
View file

0
scripts/plotting/dirs.R Executable file → Normal file
View file

0
scripts/plotting/dist_plots_check.R Executable file → Normal file
View file

0
scripts/plotting/extreme_muts.R Executable file → Normal file
View file

0
scripts/plotting/get_plotting_dfs.R Executable file → Normal file
View file

0
scripts/plotting/ggcorr_all_PS_LIG.R Executable file → Normal file
View file

0
scripts/plotting/hist_af_or_base.R Executable file → Normal file
View file

0
scripts/plotting/hist_af_or_combined.R Executable file → Normal file
View file

0
scripts/plotting/legend_adjustment.R Executable file → Normal file
View file

0
scripts/plotting/opp_mcsm_muts.R Executable file → Normal file
View file

0
scripts/plotting/or_plots_combined.R Executable file → Normal file
View file

0
scripts/plotting/other_plots_combined.R Executable file → Normal file
View file

0
scripts/plotting/output_tables.R Executable file → Normal file
View file

0
scripts/plotting/ps_plots_combined.R Executable file → Normal file
View file

0
scripts/plotting/resolving_ambiguous_muts.R Executable file → Normal file
View file