handled rpob 5uhc position offset in mcsm_ppi2
This commit is contained in:
parent
46e2c93885
commit
00b84ccb1c
30 changed files with 395 additions and 63 deletions
0
dynamut/format_results_dynamut.py
Executable file → Normal file
0
dynamut/format_results_dynamut.py
Executable file → Normal file
0
dynamut/format_results_dynamut2.py
Executable file → Normal file
0
dynamut/format_results_dynamut2.py
Executable file → Normal file
0
dynamut/run_format_results_dynamut.py
Executable file → Normal file
0
dynamut/run_format_results_dynamut.py
Executable file → Normal file
|
@ -31,7 +31,11 @@ split ../../../${INFILE} -l ${CHUNK} -d snp_batch_
|
||||||
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 50
|
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 50
|
||||||
|
|
||||||
# Date: 05/10/2021
|
# Date: 05/10/2021
|
||||||
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20
|
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20
|
||||||
|
|
||||||
|
# Date: 30/11/2021
|
||||||
|
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps_chain.csv snp_batches 20
|
||||||
|
for i in {00..40}; do mv snp_batch_${i} snp_batch_${i}.txt; done
|
||||||
|
|
||||||
# add .txt to the files
|
# add .txt to the files
|
||||||
########################################################################
|
########################################################################
|
||||||
|
|
0
mcsm_na/examples.py
Executable file → Normal file
0
mcsm_na/examples.py
Executable file → Normal file
0
mcsm_na/format_results_mcsm_na.py
Executable file → Normal file
0
mcsm_na/format_results_mcsm_na.py
Executable file → Normal file
0
mcsm_na/get_results_mcsm_na.py
Executable file → Normal file
0
mcsm_na/get_results_mcsm_na.py
Executable file → Normal file
0
mcsm_na/run_format_results_mcsm_na.py
Executable file → Normal file
0
mcsm_na/run_format_results_mcsm_na.py
Executable file → Normal file
0
mcsm_na/submit_mcsm_na.py
Executable file → Normal file
0
mcsm_na/submit_mcsm_na.py
Executable file → Normal file
|
@ -24,7 +24,7 @@ from reference_dict import up_3letter_aa_dict
|
||||||
from reference_dict import oneletter_aa_dict
|
from reference_dict import oneletter_aa_dict
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
|
|
||||||
def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
|
def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name):
|
||||||
"""
|
"""
|
||||||
@param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all mcsm snps
|
@param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all mcsm snps
|
||||||
which is the result of combining all mcsm_ppi2 batch results, and using
|
which is the result of combining all mcsm_ppi2 batch results, and using
|
||||||
|
@ -78,30 +78,57 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
|
||||||
|
|
||||||
# # check
|
# # check
|
||||||
# mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
|
# mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
|
||||||
# mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
|
# mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
|
||||||
#%%============================================================================
|
#%%=====================================================================
|
||||||
#############
|
# add offset specified position number for rpob since 5uhc with chain 'C' was
|
||||||
# rename cols
|
# used to run the analysis
|
||||||
#############
|
|
||||||
# format colnames: all lowercase and consistent colnames
|
|
||||||
mcsm_ppi2_data.columns
|
|
||||||
print('Assigning meaningful colnames'
|
|
||||||
, '\n=======================================================')
|
|
||||||
|
|
||||||
my_colnames_dict = {'chain': 'chain'
|
|
||||||
, 'wild-type': 'wt_upper'
|
|
||||||
, 'res-number': 'position'
|
|
||||||
, 'mutant': 'mut_upper'
|
|
||||||
, 'distance-to-interface': 'interface_dist'
|
|
||||||
, 'mcsm-ppi2-prediction': 'mcsm_ppi2_affinity'
|
|
||||||
, 'affinity': 'mcsm_ppi2_outcome'
|
|
||||||
, 'w_type': 'wild_type' # one letter amino acid code
|
|
||||||
, 'm_type': 'mutant_type' # one letter amino acid code
|
|
||||||
}
|
|
||||||
|
|
||||||
|
geneL_sp = ['rpob']
|
||||||
|
if gene_name.lower() in geneL_sp:
|
||||||
|
offset = 6
|
||||||
|
chain_orig = 'A'
|
||||||
|
|
||||||
|
# Add offset-corrected position number, matching the rpob nsSNPs used for mCSM-lig
|
||||||
|
# and also add corresponding chain id matching with rpob nsSNPs used for mCSM-lig
|
||||||
|
mcsm_ppi2_data['position'] = mcsm_ppi2_data['res-number'] - offset
|
||||||
|
mcsm_ppi2_data['chain'] = chain_orig
|
||||||
|
mcsm_ppi2_data['5uhc_offset'] = offset
|
||||||
|
|
||||||
|
#############
|
||||||
|
# rename cols
|
||||||
|
#############
|
||||||
|
# format colnames: all lowercase and consistent colnames
|
||||||
|
mcsm_ppi2_data.columns
|
||||||
|
print('Assigning meaningful colnames'
|
||||||
|
, '\n=======================================================')
|
||||||
|
|
||||||
|
my_colnames_dict = {'chain' : 'chain'
|
||||||
|
, 'position' : 'position'
|
||||||
|
, '5uhc_offset' : '5uhc_offset'
|
||||||
|
, 'wild-type' : 'wt_upper'
|
||||||
|
, 'res-number' : '5uhc_position'
|
||||||
|
, 'mutant' : 'mut_upper'
|
||||||
|
, 'distance-to-interface': 'interface_dist'
|
||||||
|
, 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
|
||||||
|
, 'affinity' : 'mcsm_ppi2_outcome'
|
||||||
|
, 'w_type' : 'wild_type' # one letter amino acid code
|
||||||
|
, 'm_type' : 'mutant_type' # one letter amino acid code
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
my_colnames_dict = {'chain' : 'chain'
|
||||||
|
, 'wild-type' : 'wt_upper'
|
||||||
|
, 'res-number' : 'position'
|
||||||
|
, 'mutant' : 'mut_upper'
|
||||||
|
, 'distance-to-interface': 'interface_dist'
|
||||||
|
, 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
|
||||||
|
, 'affinity' : 'mcsm_ppi2_outcome'
|
||||||
|
, 'w_type' : 'wild_type' # one letter amino acid code
|
||||||
|
, 'm_type' : 'mutant_type' # one letter amino acid code
|
||||||
|
}
|
||||||
|
#%%==============================================================================
|
||||||
mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
|
mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
|
||||||
mcsm_ppi2_data.columns
|
mcsm_ppi2_data.columns
|
||||||
|
|
||||||
#############
|
#############
|
||||||
# create mutationinformation column
|
# create mutationinformation column
|
||||||
#############
|
#############
|
||||||
|
@ -137,22 +164,47 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
|
||||||
, '\nExpected number:', mcsm_ppi2_pos
|
, '\nExpected number:', mcsm_ppi2_pos
|
||||||
, '\nGot:', mcsm_ppi2_pos2
|
, '\nGot:', mcsm_ppi2_pos2
|
||||||
, '\n======================================================')
|
, '\n======================================================')
|
||||||
|
|
||||||
#%%=====================================================================
|
#%%=====================================================================
|
||||||
#############
|
###################
|
||||||
# reorder columns
|
# reorder columns
|
||||||
#############
|
###################
|
||||||
mcsm_ppi2_data.columns
|
mcsm_ppi2_data.columns
|
||||||
mcsm_ppi2_dataf = mcsm_ppi2_data[['mutationinformation'
|
|
||||||
, 'mcsm_ppi2_affinity'
|
#---------------------
|
||||||
, 'mcsm_ppi2_scaled'
|
# Determine col order
|
||||||
, 'mcsm_ppi2_outcome'
|
#---------------------
|
||||||
, 'interface_dist'
|
|
||||||
, 'wild_type'
|
core_cols = ['mutationinformation'
|
||||||
, 'position'
|
, 'mcsm_ppi2_affinity'
|
||||||
, 'mutant_type'
|
, 'mcsm_ppi2_scaled'
|
||||||
, 'wt_upper'
|
, 'mcsm_ppi2_outcome'
|
||||||
, 'mut_upper'
|
, 'interface_dist'
|
||||||
, 'chain']]
|
, 'wild_type'
|
||||||
|
, 'position'
|
||||||
|
, 'mutant_type'
|
||||||
|
, 'wt_upper'
|
||||||
|
, 'mut_upper'
|
||||||
|
, 'chain']
|
||||||
|
|
||||||
|
if gene_name.lower() in geneL_sp:
|
||||||
|
|
||||||
|
column_order = core_cols + ['5uhc_offset', '5uhc_position']
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
column_order = core_cols.copy()
|
||||||
|
|
||||||
|
#--------------
|
||||||
|
# reorder now
|
||||||
|
#--------------
|
||||||
|
mcsm_ppi2_dataf = mcsm_ppi2_data[column_order]
|
||||||
|
|
||||||
|
#%%============================================================================
|
||||||
|
###################
|
||||||
|
# Sort df based on
|
||||||
|
# position columns
|
||||||
|
###################
|
||||||
|
mcsm_ppi2_dataf.sort_values(by = ['position', 'mutant_type'], inplace = True, ascending = True)
|
||||||
|
|
||||||
return(mcsm_ppi2_dataf)
|
return(mcsm_ppi2_dataf)
|
||||||
#%%#####################################################################
|
#%%#####################################################################
|
|
@ -67,7 +67,7 @@ outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv'
|
||||||
# Data: gid+streptomycin
|
# Data: gid+streptomycin
|
||||||
#==========================
|
#==========================
|
||||||
print('Formatting results for:', infile_mcsm_ppi2)
|
print('Formatting results for:', infile_mcsm_ppi2)
|
||||||
mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2)
|
mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2, gene_name = gene)
|
||||||
|
|
||||||
# writing file
|
# writing file
|
||||||
print('Writing formatted df to csv')
|
print('Writing formatted df to csv')
|
||||||
|
|
|
@ -53,7 +53,7 @@ homedir = os.path.expanduser('~')
|
||||||
|
|
||||||
# set working dir
|
# set working dir
|
||||||
os.getcwd()
|
os.getcwd()
|
||||||
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||||
os.getcwd()
|
os.getcwd()
|
||||||
|
|
||||||
# FIXME: local imports
|
# FIXME: local imports
|
||||||
|
@ -170,6 +170,18 @@ infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
|
||||||
infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
|
infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
|
||||||
mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
|
mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
|
||||||
|
|
||||||
|
# more output added
|
||||||
|
## consurf [change colnames]
|
||||||
|
|
||||||
|
infilename_consurf = gene.lower() + '_consurf_grades_f.csv'
|
||||||
|
infile_consurf = outdir + 'consurf/'+ infilename_consurf
|
||||||
|
consurf_df = pd.read_csv(infile_consurf, sep = ',')
|
||||||
|
|
||||||
|
## SNAP2 [add normalised score]
|
||||||
|
infilename_snap2 = gene.lower() + '_snap2_output.csv'
|
||||||
|
infile_snap2 = outdir + 'snap2/'+ infilename_snap2
|
||||||
|
snap2_df = pd.read_csv(infile_snap2, sep = ',')
|
||||||
|
|
||||||
#------------------------------------------------------------------------------
|
#------------------------------------------------------------------------------
|
||||||
# ONLY:for gene pnca and gid: End logic should pick this up!
|
# ONLY:for gene pnca and gid: End logic should pick this up!
|
||||||
geneL_na = ['gid', 'rpob']
|
geneL_na = ['gid', 'rpob']
|
||||||
|
@ -196,7 +208,7 @@ if gene.lower() in geneL_dy:
|
||||||
# infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
|
# infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
|
||||||
# mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
|
# mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
|
||||||
|
|
||||||
# ONLY:for gene embb and alr: End logic should pick this up!
|
# ONLY:for gene embb and alr and katg: End logic should pick this up!
|
||||||
geneL_ppi2 = ['embb', 'alr']
|
geneL_ppi2 = ['embb', 'alr']
|
||||||
#if gene.lower() == "embb" or "alr":
|
#if gene.lower() == "embb" or "alr":
|
||||||
if gene.lower() in geneL_ppi2:
|
if gene.lower() in geneL_ppi2:
|
||||||
|
@ -381,6 +393,247 @@ else:
|
||||||
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
|
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
|
||||||
print('\nPASS: Deepddg data is scaled between -1 and 1',
|
print('\nPASS: Deepddg data is scaled between -1 and 1',
|
||||||
'\nproceeding with merge')
|
'\nproceeding with merge')
|
||||||
|
|
||||||
|
#=======================
|
||||||
|
# Consurf
|
||||||
|
#=======================
|
||||||
|
consurf_df.shape
|
||||||
|
|
||||||
|
# drop row 0, as it contains no values, only leftover text
|
||||||
|
consurf_df = consurf_df.drop(index=0)
|
||||||
|
|
||||||
|
#----------------------
|
||||||
|
# rename columns
|
||||||
|
#----------------------
|
||||||
|
consurf_df.columns
|
||||||
|
print('\nRenaming cols and assigning pretty column names')
|
||||||
|
|
||||||
|
geneL_consurf = ['alr', 'katg', 'rpob']
|
||||||
|
|
||||||
|
if gene.lower() in geneL_consurf:
|
||||||
|
consurf_df = consurf_df.rename(columns={'POS' : 'position_consurf'})
|
||||||
|
#---------------------------
|
||||||
|
# Specify the offset
|
||||||
|
#---------------------------
|
||||||
|
print('\nAdding offset value for gene:', gene.lower())
|
||||||
|
|
||||||
|
if gene.lower() == 'alr':
|
||||||
|
offset_val = 34
|
||||||
|
|
||||||
|
print('\nUsing offset val:', offset_val)
|
||||||
|
if gene.lower() == 'katg':
|
||||||
|
offset_val = 23
|
||||||
|
print('\nUsing offset val:', offset_val)
|
||||||
|
|
||||||
|
if gene.lower() == 'rpob':
|
||||||
|
offset_val = 28
|
||||||
|
print('\nUsing offset val:', offset_val)
|
||||||
|
|
||||||
|
consurf_df['position'] = consurf_df['position_consurf'] + offset_val
|
||||||
|
|
||||||
|
else:
|
||||||
|
consurf_df = consurf_df.rename(columns={'POS' : 'position'})
|
||||||
|
|
||||||
|
consurf_df = consurf_df.rename(columns={'SEQ' : 'wild_type'
|
||||||
|
, '3LATOM': 'wt_3upper'
|
||||||
|
, 'SCORE' : 'consurf_score'
|
||||||
|
, 'COLOR' : 'consurf_colour_str'
|
||||||
|
, 'CONFIDENCEINTERVAL' : 'consurf_ci'
|
||||||
|
, 'CONFIDENCEINTERVALCOLORS' : 'consurf_ci_colour'
|
||||||
|
, 'MSADATA' : 'consurf_msa_data'
|
||||||
|
, 'RESIDUEVARIETY' : 'consurf_aa_variety'})
|
||||||
|
# quick check
|
||||||
|
if len(consurf_df) == len(rd_df):
|
||||||
|
print('\nPASS: length of consurf df is as expected'
|
||||||
|
, '\nProceeding to format consurf df')
|
||||||
|
else:
|
||||||
|
print('\nFAIL: length mismatch'
|
||||||
|
, '\nExpected nrows:', len(rd_df)
|
||||||
|
, '\nGot:', len(consurf_df))
|
||||||
|
|
||||||
|
consurf_df.dtypes
|
||||||
|
consurf_df['consurf_score'] = consurf_df['consurf_score'].astype(float)
|
||||||
|
|
||||||
|
consurf_df['consurf_colour'] = consurf_df['consurf_colour_str'].str.extract(r'(\d).*')
|
||||||
|
consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int)
|
||||||
|
|
||||||
|
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*','0')
|
||||||
|
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int)
|
||||||
|
|
||||||
|
consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):')
|
||||||
|
consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci_upper'].astype(float)
|
||||||
|
|
||||||
|
consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci'].str.extract(r':(.*)')
|
||||||
|
consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci_lower'].astype(float)
|
||||||
|
|
||||||
|
#consurf_df['wt_3upper_f'] = consurf_df['wt_3upper'].str.extract(r'^\w{3}(\d+.*)')
|
||||||
|
#consurf_df['wt_3upper_f']
|
||||||
|
consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
|
||||||
|
|
||||||
|
consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)')
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
# scale consurf values
|
||||||
|
#-------------------------
|
||||||
|
# Rescale values in consurf_score col b/w -1 and 1 so negative numbers
|
||||||
|
# stay neg and pos numbers stay positive
|
||||||
|
consurf_min = consurf_df['consurf_score'].min()
|
||||||
|
consurf_max = consurf_df['consurf_score'].max()
|
||||||
|
consurf_min
|
||||||
|
consurf_max
|
||||||
|
|
||||||
|
# quick check
|
||||||
|
len(consurf_df.loc[consurf_df['consurf_score'] >= 0])
|
||||||
|
len(consurf_df.loc[consurf_df['consurf_score'] < 0])
|
||||||
|
|
||||||
|
consurf_scale = lambda x : x/abs(consurf_min) if x < 0 else (x/consurf_max if x >= 0 else 'failed')
|
||||||
|
|
||||||
|
consurf_df['consurf_scaled'] = consurf_df['consurf_score'].apply(consurf_scale)
|
||||||
|
print('\nRaw consurf scores:\n', consurf_df['consurf_score']
|
||||||
|
, '\n---------------------------------------------------------------'
|
||||||
|
, '\nScaled consurf scores:\n', consurf_df['consurf_scaled'])
|
||||||
|
|
||||||
|
# additional check added
|
||||||
|
csmi = consurf_df['consurf_scaled'].min()
|
||||||
|
csma = consurf_df['consurf_scaled'].max()
|
||||||
|
|
||||||
|
c = consurf_df[consurf_df['consurf_score']>=0].count()
|
||||||
|
consurf_pos = c.get(key = 'consurf_score')
|
||||||
|
|
||||||
|
c2 = consurf_df[consurf_df['consurf_scaled']>=0].count()
|
||||||
|
consurf_pos2 = c2.get(key = 'consurf_scaled')
|
||||||
|
|
||||||
|
if consurf_pos == consurf_pos2 and csmi == -1 and csma == 1:
|
||||||
|
print('\nPASS: Consurf values scaled correctly b/w -1 and 1')
|
||||||
|
else:
|
||||||
|
print('\nFAIL: Consurf values scaled numbers MISmatch'
|
||||||
|
, '\nExpected number:', consurf_pos
|
||||||
|
, '\nGot:', consurf_pos2
|
||||||
|
, '\n======================================================')
|
||||||
|
|
||||||
|
consurf_df.dtypes
|
||||||
|
consurf_df.columns
|
||||||
|
|
||||||
|
#---------------------------
|
||||||
|
# select columns
|
||||||
|
# (and also determine order)
|
||||||
|
#---------------------------
|
||||||
|
consurf_df_f = consurf_df[['position'
|
||||||
|
, 'wild_type'
|
||||||
|
, 'chain'
|
||||||
|
, 'wt_3upper'
|
||||||
|
, 'consurf_score'
|
||||||
|
, 'consurf_scaled'
|
||||||
|
, 'consurf_colour'
|
||||||
|
, 'consurf_colour_rev'
|
||||||
|
, 'consurf_ci_upper'
|
||||||
|
, 'consurf_ci_lower'
|
||||||
|
, 'consurf_ci_colour'
|
||||||
|
, 'consurf_msa_data'
|
||||||
|
, 'consurf_aa_variety']]
|
||||||
|
|
||||||
|
#=======================
|
||||||
|
# SNAP2
|
||||||
|
#=======================
|
||||||
|
snap2_df.shape
|
||||||
|
|
||||||
|
#----------------------
|
||||||
|
# rename columns
|
||||||
|
#----------------------
|
||||||
|
geneL_snap2 = ['alr', 'katg', 'rpob']
|
||||||
|
|
||||||
|
if gene.lower() in geneL_snap2:
|
||||||
|
print('\nReading SNAP2 for gene:', gene.lower()
|
||||||
|
, '\nOffset column also being read'
|
||||||
|
, '\nRenaming columns...'
|
||||||
|
, '\nColumn mutationinformation exists. Renaming SNAP2 column variant --> mutationinformation')
|
||||||
|
|
||||||
|
snap2_df = snap2_df.rename(columns = {'mutationinformation': 'mutationinformation'
|
||||||
|
, 'Variant' : 'mutationinformation_snap2'
|
||||||
|
, 'Predicted Effect' : 'snap2_outcome'
|
||||||
|
, 'Score' : 'snap2_score'
|
||||||
|
, 'Expected Accuracy': 'snap2_accuracy_pc'})
|
||||||
|
else:
|
||||||
|
print('\nReading SNAP2 for gene:', gene.lower()
|
||||||
|
, '\nNo offset column for SNAP2'
|
||||||
|
, '\nRenaming columns...'
|
||||||
|
, '\nRenaming SNAP2 column variant --> mutationinformation')
|
||||||
|
|
||||||
|
snap2_df = snap2_df.rename(columns = {'Variant' : 'mutationinformation'
|
||||||
|
, 'Predicted Effect' : 'snap2_outcome'
|
||||||
|
, 'Score' : 'snap2_score'
|
||||||
|
, 'Expected Accuracy': 'snap2_accuracy_pc'})
|
||||||
|
|
||||||
|
snap2_df.columns
|
||||||
|
snap2_df.head()
|
||||||
|
snap2_df.dtypes
|
||||||
|
|
||||||
|
snap2_df['snap2_accuracy_pc'] = snap2_df['snap2_accuracy_pc'].str.replace('%','')
|
||||||
|
snap2_df['snap2_accuracy_pc'] = snap2_df['snap2_accuracy_pc'].astype(int)
|
||||||
|
|
||||||
|
#-------------------------
|
||||||
|
# scale snap2 values
|
||||||
|
#-------------------------
|
||||||
|
# Rescale values in snap2_score col b/w -1 and 1 so negative numbers
|
||||||
|
# stay neg and pos numbers stay positive
|
||||||
|
snap2_min = snap2_df['snap2_score'].min()
|
||||||
|
snap2_max = snap2_df['snap2_score'].max()
|
||||||
|
snap2_min
|
||||||
|
snap2_max
|
||||||
|
|
||||||
|
# quick check
|
||||||
|
len(snap2_df.loc[snap2_df['snap2_score'] >= 0])
|
||||||
|
len(snap2_df.loc[snap2_df['snap2_score'] < 0])
|
||||||
|
|
||||||
|
snap2_scale = lambda x : x/abs(snap2_min) if x < 0 else (x/snap2_max if x >= 0 else 'failed')
|
||||||
|
|
||||||
|
snap2_df['snap2_scaled'] = snap2_df['snap2_score'].apply(snap2_scale)
|
||||||
|
print('\nRaw snap2 scores:\n', snap2_df['snap2_score']
|
||||||
|
, '\n---------------------------------------------------------------'
|
||||||
|
, '\nScaled snap2 scores:\n', snap2_df['snap2_scaled'])
|
||||||
|
|
||||||
|
# additional check added
|
||||||
|
ssmi = snap2_df['snap2_scaled'].min()
|
||||||
|
ssma = snap2_df['snap2_scaled'].max()
|
||||||
|
|
||||||
|
sn = snap2_df[snap2_df['snap2_score']>=0].count()
|
||||||
|
snap2_pos = sn.get(key = 'snap2_score')
|
||||||
|
|
||||||
|
sn2 = snap2_df[snap2_df['snap2_scaled']>=0].count()
|
||||||
|
snap2_pos2 = sn2.get(key = 'snap2_scaled')
|
||||||
|
|
||||||
|
if snap2_pos == snap2_pos2 and csmi == -1 and csma == 1:
|
||||||
|
print('\nPASS: Snap2 values scaled correctly b/w -1 and 1')
|
||||||
|
else:
|
||||||
|
print('\nFAIL: snap2 values scaled numbers MISmatch'
|
||||||
|
, '\nExpected number:', snap2_pos
|
||||||
|
, '\nGot:', snap2_pos2
|
||||||
|
, '\n======================================================')
|
||||||
|
|
||||||
|
#---------------------------
|
||||||
|
# select columns
|
||||||
|
# (and also determine order)
|
||||||
|
#---------------------------
|
||||||
|
snap2_df.dtypes
|
||||||
|
snap2_df.columns
|
||||||
|
|
||||||
|
geneL_snap2 = ['alr', 'katg', 'rpob']
|
||||||
|
|
||||||
|
if gene.lower() in geneL_snap2:
|
||||||
|
print('\nSelecting cols SNAP2 for gene:', gene.lower())
|
||||||
|
snap2_df_f = snap2_df[['mutationinformation'
|
||||||
|
, 'mutationinformation_snap2'
|
||||||
|
, 'snap2_score'
|
||||||
|
, 'snap2_scaled'
|
||||||
|
, 'snap2_accuracy_pc'
|
||||||
|
, 'snap2_outcome']]
|
||||||
|
else:
|
||||||
|
print('\nSelecting cols SNAP2 for gene:', gene.lower())
|
||||||
|
snap2_df_f = snap2_df[['mutationinformation'
|
||||||
|
, 'snap2_score'
|
||||||
|
, 'snap2_scaled'
|
||||||
|
, 'snap2_accuracy_pc'
|
||||||
|
, 'snap2_outcome']]
|
||||||
|
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
# Now merges begin
|
# Now merges begin
|
||||||
|
@ -499,7 +752,9 @@ merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
|
||||||
dssp_kd_dfs = pd.merge(dssp_df
|
dssp_kd_dfs = pd.merge(dssp_df
|
||||||
, kd_df
|
, kd_df
|
||||||
, on = merging_cols_m2
|
, on = merging_cols_m2
|
||||||
, how = "outer")
|
#, how = "outer")
|
||||||
|
, how = "inner")
|
||||||
|
|
||||||
|
|
||||||
print('\n\nResult of third merge:', dssp_kd_dfs.shape
|
print('\n\nResult of third merge:', dssp_kd_dfs.shape
|
||||||
, '\n===================================================================')
|
, '\n===================================================================')
|
||||||
|
@ -521,6 +776,26 @@ print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape
|
||||||
, '\n===================================================================')
|
, '\n===================================================================')
|
||||||
dssp_kd_rd_dfs[merging_cols_m3].apply(len)
|
dssp_kd_rd_dfs[merging_cols_m3].apply(len)
|
||||||
dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
|
dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
|
||||||
|
|
||||||
|
#%%============================================================================
|
||||||
|
print('==================================='
|
||||||
|
, '\nFourth merge*: fourth merge + consurf_df'
|
||||||
|
, '\dssp_kd_rd_dfs + consurf_df'
|
||||||
|
, '\n===================================')
|
||||||
|
#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = "outer")
|
||||||
|
merging_cols_m3_v2 = detect_common_cols(dssp_kd_rd_dfs, consurf_df)
|
||||||
|
dssp_kd_rd_con_dfs = pd.merge(dssp_kd_rd_dfs
|
||||||
|
, consurf_df
|
||||||
|
, on = merging_cols_m3_v2
|
||||||
|
, how = "outer")
|
||||||
|
|
||||||
|
ncols_m3_v2 = len(dssp_kd_rd_con_dfs.columns)
|
||||||
|
|
||||||
|
print('\n\nResult of fourth merge*:', dssp_kd_rd_con_dfs.shape
|
||||||
|
, '\n===================================================================')
|
||||||
|
dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len)
|
||||||
|
dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len) == len(dssp_kd_rd_con_dfs)
|
||||||
|
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
print('======================================='
|
print('======================================='
|
||||||
, '\nFifth merge: Second merge + fourth merge'
|
, '\nFifth merge: Second merge + fourth merge'
|
||||||
|
|
|
@ -75,15 +75,14 @@ args = arg_parser.parse_args()
|
||||||
drug = args.drug
|
drug = args.drug
|
||||||
gene = args.gene
|
gene = args.gene
|
||||||
|
|
||||||
#drug = 'pyrazinamide'
|
|
||||||
#gene = 'pncA'
|
|
||||||
|
|
||||||
gene_match = gene + '_p.'
|
gene_match = gene + '_p.'
|
||||||
print('mut pattern for gene', gene, ':', gene_match)
|
print('mut pattern for gene', gene, ':', gene_match)
|
||||||
|
|
||||||
nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
|
nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
|
||||||
print('nsSNP for gene', gene, ':', nssnp_match)
|
print('nsSNP for gene', gene, ':', nssnp_match)
|
||||||
|
|
||||||
|
nssnp_match2 = re.compile(nssnp_match)
|
||||||
|
|
||||||
wt_regex = gene_match.lower()+'([A-Za-z]{3})'
|
wt_regex = gene_match.lower()+'([A-Za-z]{3})'
|
||||||
print('wt regex:', wt_regex)
|
print('wt regex:', wt_regex)
|
||||||
|
|
||||||
|
@ -219,20 +218,21 @@ meta_gene_epi = meta_gene_multi.loc[(meta_gene_multi['dr_mult_snp_count']>1) | (
|
||||||
|
|
||||||
#%% TEST
|
#%% TEST
|
||||||
# formatting, replace !nssnp_match with nothing
|
# formatting, replace !nssnp_match with nothing
|
||||||
foo1 = 'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
|
#foo1 = 'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
|
||||||
foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'
|
#foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'
|
||||||
|
|
||||||
|
|
||||||
foo1_s = foo1.split(';')
|
#foo1_s = foo1.split(';')
|
||||||
foo1_s
|
#foo1_s
|
||||||
nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
|
#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
|
||||||
arse=list(filter(nssnp_match2.match, foo1_s))
|
#arse=list(filter(nssnp_match2.match, foo1_s))
|
||||||
arse
|
#arse
|
||||||
|
|
||||||
|
#foo1_s2 = ';'.join(arse)
|
||||||
|
#foo1_s2
|
||||||
|
|
||||||
foo1_s2 = ';'.join(arse)
|
|
||||||
foo1_s2
|
|
||||||
#%%
|
#%%
|
||||||
nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
|
#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
|
||||||
|
|
||||||
# dr_muts_col
|
# dr_muts_col
|
||||||
dr_clean_col = dr_muts_col + '_clean'
|
dr_clean_col = dr_muts_col + '_clean'
|
||||||
|
@ -248,6 +248,7 @@ for i, v in enumerate(meta_gene_epi[dr_muts_col]):
|
||||||
dr2_s = v.split(';')
|
dr2_s = v.split(';')
|
||||||
print(dr2_s)
|
print(dr2_s)
|
||||||
dr2_sf = list(filter(nssnp_match2.match, dr2_s))
|
dr2_sf = list(filter(nssnp_match2.match, dr2_s))
|
||||||
|
#dr2_sf = list(filter(nssnp_match.match, dr2_s))
|
||||||
print(dr2_sf)
|
print(dr2_sf)
|
||||||
dr2_sf2 = ';'.join(dr2_sf)
|
dr2_sf2 = ';'.join(dr2_sf)
|
||||||
meta_gene_epi[dr_clean_col].iloc[i] = dr2_sf2
|
meta_gene_epi[dr_clean_col].iloc[i] = dr2_sf2
|
||||||
|
@ -262,13 +263,13 @@ meta_gene_epi[other_clean_col] = ''
|
||||||
|
|
||||||
for i, v in enumerate(meta_gene_epi[other_muts_col]):
|
for i, v in enumerate(meta_gene_epi[other_muts_col]):
|
||||||
#print(i, v)
|
#print(i, v)
|
||||||
print('======================================================')
|
#print('======================================================')
|
||||||
print(i)
|
#print(i)
|
||||||
print(v)
|
#print(v)
|
||||||
other2_s = v.split(';')
|
other2_s = v.split(';')
|
||||||
print(other2_s)
|
#print(other2_s)
|
||||||
other2_sf = list(filter(nssnp_match2.match, other2_s))
|
other2_sf = list(filter(nssnp_match2.match, other2_s))
|
||||||
print(other2_sf)
|
#print(other2_sf)
|
||||||
other2_sf2 = ';'.join(other2_sf)
|
other2_sf2 = ';'.join(other2_sf)
|
||||||
meta_gene_epi[other_clean_col].iloc[i] = other2_sf2
|
meta_gene_epi[other_clean_col].iloc[i] = other2_sf2
|
||||||
|
|
||||||
|
@ -281,7 +282,8 @@ meta_gene_epi_f = meta_gene_epi[['id', 'sample'
|
||||||
, 'dr_mult_snp_count'
|
, 'dr_mult_snp_count'
|
||||||
, other_muts_col, other_clean_col
|
, other_muts_col, other_clean_col
|
||||||
, 'other_mult_snp_count']]
|
, 'other_mult_snp_count']]
|
||||||
meta_gene_epi_f.columns
|
#print(meta_gene_epi_f.columns)
|
||||||
|
print(meta_gene_epi_f)
|
||||||
|
|
||||||
cols_to_output = ['id', 'sample'
|
cols_to_output = ['id', 'sample'
|
||||||
, dr_clean_col
|
, dr_clean_col
|
||||||
|
@ -293,7 +295,6 @@ cols_to_output = ['id', 'sample'
|
||||||
meta_gene_epi_f2 = meta_gene_epi_f[cols_to_output]
|
meta_gene_epi_f2 = meta_gene_epi_f[cols_to_output]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
# formatting, replace !nssnp_match with nothing
|
# formatting, replace !nssnp_match with nothing
|
||||||
#nssnp_neg_match = '(?!pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})'
|
#nssnp_neg_match = '(?!pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})'
|
||||||
|
|
|
@ -92,7 +92,7 @@ else:
|
||||||
infile_fasta = indir + '/' + in_filename_fasta
|
infile_fasta = indir + '/' + in_filename_fasta
|
||||||
print('Input fasta file:', infile_fasta
|
print('Input fasta file:', infile_fasta
|
||||||
, '\n============================================================')
|
, '\n============================================================')
|
||||||
|
|
||||||
#=======
|
#=======
|
||||||
# output
|
# output
|
||||||
#=======
|
#=======
|
||||||
|
|
0
scripts/plotting/basic_barplots_combined.R
Executable file → Normal file
0
scripts/plotting/basic_barplots_combined.R
Executable file → Normal file
0
scripts/plotting/corr_adjusted_PS_LIG.R
Executable file → Normal file
0
scripts/plotting/corr_adjusted_PS_LIG.R
Executable file → Normal file
0
scripts/plotting/dirs.R
Executable file → Normal file
0
scripts/plotting/dirs.R
Executable file → Normal file
0
scripts/plotting/dist_plots_check.R
Executable file → Normal file
0
scripts/plotting/dist_plots_check.R
Executable file → Normal file
0
scripts/plotting/extreme_muts.R
Executable file → Normal file
0
scripts/plotting/extreme_muts.R
Executable file → Normal file
0
scripts/plotting/get_plotting_dfs.R
Executable file → Normal file
0
scripts/plotting/get_plotting_dfs.R
Executable file → Normal file
0
scripts/plotting/ggcorr_all_PS_LIG.R
Executable file → Normal file
0
scripts/plotting/ggcorr_all_PS_LIG.R
Executable file → Normal file
0
scripts/plotting/hist_af_or_base.R
Executable file → Normal file
0
scripts/plotting/hist_af_or_base.R
Executable file → Normal file
0
scripts/plotting/hist_af_or_combined.R
Executable file → Normal file
0
scripts/plotting/hist_af_or_combined.R
Executable file → Normal file
0
scripts/plotting/legend_adjustment.R
Executable file → Normal file
0
scripts/plotting/legend_adjustment.R
Executable file → Normal file
0
scripts/plotting/opp_mcsm_muts.R
Executable file → Normal file
0
scripts/plotting/opp_mcsm_muts.R
Executable file → Normal file
0
scripts/plotting/or_plots_combined.R
Executable file → Normal file
0
scripts/plotting/or_plots_combined.R
Executable file → Normal file
0
scripts/plotting/other_plots_combined.R
Executable file → Normal file
0
scripts/plotting/other_plots_combined.R
Executable file → Normal file
0
scripts/plotting/output_tables.R
Executable file → Normal file
0
scripts/plotting/output_tables.R
Executable file → Normal file
0
scripts/plotting/ps_plots_combined.R
Executable file → Normal file
0
scripts/plotting/ps_plots_combined.R
Executable file → Normal file
0
scripts/plotting/resolving_ambiguous_muts.R
Executable file → Normal file
0
scripts/plotting/resolving_ambiguous_muts.R
Executable file → Normal file
Loading…
Add table
Add a link
Reference in a new issue