handled rpob 5uhc position offset in mcsm_ppi2
This commit is contained in:
parent
46e2c93885
commit
00b84ccb1c
30 changed files with 395 additions and 63 deletions
0
dynamut/format_results_dynamut.py
Executable file → Normal file
0
dynamut/format_results_dynamut.py
Executable file → Normal file
0
dynamut/format_results_dynamut2.py
Executable file → Normal file
0
dynamut/format_results_dynamut2.py
Executable file → Normal file
0
dynamut/run_format_results_dynamut.py
Executable file → Normal file
0
dynamut/run_format_results_dynamut.py
Executable file → Normal file
|
@ -33,5 +33,9 @@ split ../../../${INFILE} -l ${CHUNK} -d snp_batch_
|
|||
# Date: 05/10/2021
|
||||
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20
|
||||
|
||||
# Date: 30/11/2021
|
||||
#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps_chain.csv snp_batches 20
|
||||
for i in {00..40}; do mv snp_batch_${i} snp_batch_${i}.txt; done
|
||||
|
||||
# add .txt to the files
|
||||
########################################################################
|
||||
|
|
0
mcsm_na/examples.py
Executable file → Normal file
0
mcsm_na/examples.py
Executable file → Normal file
0
mcsm_na/format_results_mcsm_na.py
Executable file → Normal file
0
mcsm_na/format_results_mcsm_na.py
Executable file → Normal file
0
mcsm_na/get_results_mcsm_na.py
Executable file → Normal file
0
mcsm_na/get_results_mcsm_na.py
Executable file → Normal file
0
mcsm_na/run_format_results_mcsm_na.py
Executable file → Normal file
0
mcsm_na/run_format_results_mcsm_na.py
Executable file → Normal file
0
mcsm_na/submit_mcsm_na.py
Executable file → Normal file
0
mcsm_na/submit_mcsm_na.py
Executable file → Normal file
|
@ -24,7 +24,7 @@ from reference_dict import up_3letter_aa_dict
|
|||
from reference_dict import oneletter_aa_dict
|
||||
#%%============================================================================
|
||||
|
||||
def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
|
||||
def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name):
|
||||
"""
|
||||
@param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all mcsm snps
|
||||
which is the result of combining all mcsm_ppi2 batch results, and using
|
||||
|
@ -79,26 +79,53 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
|
|||
# # check
|
||||
# mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
|
||||
# mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
|
||||
#%%============================================================================
|
||||
#############
|
||||
# rename cols
|
||||
#############
|
||||
# format colnames: all lowercase and consistent colnames
|
||||
mcsm_ppi2_data.columns
|
||||
print('Assigning meaningful colnames'
|
||||
, '\n=======================================================')
|
||||
#%%=====================================================================
|
||||
# add offset specified position number for rpob since 5uhc with chain 'C' was
|
||||
# used to run the analysis
|
||||
|
||||
my_colnames_dict = {'chain': 'chain'
|
||||
, 'wild-type': 'wt_upper'
|
||||
, 'res-number': 'position'
|
||||
, 'mutant': 'mut_upper'
|
||||
, 'distance-to-interface': 'interface_dist'
|
||||
, 'mcsm-ppi2-prediction': 'mcsm_ppi2_affinity'
|
||||
, 'affinity': 'mcsm_ppi2_outcome'
|
||||
, 'w_type': 'wild_type' # one letter amino acid code
|
||||
, 'm_type': 'mutant_type' # one letter amino acid code
|
||||
}
|
||||
geneL_sp = ['rpob']
|
||||
if gene_name.lower() in geneL_sp:
|
||||
offset = 6
|
||||
chain_orig = 'A'
|
||||
|
||||
# Add offset corrected position number. matching with rpob nsSNPs used for mCSM-lig
|
||||
# and also add corresponding chain id matching with rpob nsSNPs used for mCSM-lig
|
||||
mcsm_ppi2_data['position'] = mcsm_ppi2_data['res-number'] - offset
|
||||
mcsm_ppi2_data['chain'] = chain_orig
|
||||
mcsm_ppi2_data['5uhc_offset'] = offset
|
||||
|
||||
#############
|
||||
# rename cols
|
||||
#############
|
||||
# format colnames: all lowercase and consistent colnames
|
||||
mcsm_ppi2_data.columns
|
||||
print('Assigning meaningful colnames'
|
||||
, '\n=======================================================')
|
||||
|
||||
my_colnames_dict = {'chain' : 'chain'
|
||||
, 'position' : 'position'
|
||||
, '5uhc_offset' : '5uhc_offset'
|
||||
, 'wild-type' : 'wt_upper'
|
||||
, 'res-number' : '5uhc_position'
|
||||
, 'mutant' : 'mut_upper'
|
||||
, 'distance-to-interface': 'interface_dist'
|
||||
, 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
|
||||
, 'affinity' : 'mcsm_ppi2_outcome'
|
||||
, 'w_type' : 'wild_type' # one letter amino acid code
|
||||
, 'm_type' : 'mutant_type' # one letter amino acid code
|
||||
}
|
||||
else:
|
||||
my_colnames_dict = {'chain' : 'chain'
|
||||
, 'wild-type' : 'wt_upper'
|
||||
, 'res-number' : 'position'
|
||||
, 'mutant' : 'mut_upper'
|
||||
, 'distance-to-interface': 'interface_dist'
|
||||
, 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
|
||||
, 'affinity' : 'mcsm_ppi2_outcome'
|
||||
, 'w_type' : 'wild_type' # one letter amino acid code
|
||||
, 'm_type' : 'mutant_type' # one letter amino acid code
|
||||
}
|
||||
#%%==============================================================================
|
||||
mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
|
||||
mcsm_ppi2_data.columns
|
||||
|
||||
|
@ -137,22 +164,47 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
|
|||
, '\nExpected number:', mcsm_ppi2_pos
|
||||
, '\nGot:', mcsm_ppi2_pos2
|
||||
, '\n======================================================')
|
||||
|
||||
#%%=====================================================================
|
||||
#############
|
||||
###################
|
||||
# reorder columns
|
||||
#############
|
||||
###################
|
||||
mcsm_ppi2_data.columns
|
||||
mcsm_ppi2_dataf = mcsm_ppi2_data[['mutationinformation'
|
||||
, 'mcsm_ppi2_affinity'
|
||||
, 'mcsm_ppi2_scaled'
|
||||
, 'mcsm_ppi2_outcome'
|
||||
, 'interface_dist'
|
||||
, 'wild_type'
|
||||
, 'position'
|
||||
, 'mutant_type'
|
||||
, 'wt_upper'
|
||||
, 'mut_upper'
|
||||
, 'chain']]
|
||||
|
||||
#---------------------
|
||||
# Determine col order
|
||||
#---------------------
|
||||
|
||||
core_cols = ['mutationinformation'
|
||||
, 'mcsm_ppi2_affinity'
|
||||
, 'mcsm_ppi2_scaled'
|
||||
, 'mcsm_ppi2_outcome'
|
||||
, 'interface_dist'
|
||||
, 'wild_type'
|
||||
, 'position'
|
||||
, 'mutant_type'
|
||||
, 'wt_upper'
|
||||
, 'mut_upper'
|
||||
, 'chain']
|
||||
|
||||
if gene_name.lower() in geneL_sp:
|
||||
|
||||
column_order = core_cols + ['5uhc_offset', '5uhc_position']
|
||||
|
||||
else:
|
||||
|
||||
column_order = core_cols.copy()
|
||||
|
||||
#--------------
|
||||
# reorder now
|
||||
#--------------
|
||||
mcsm_ppi2_dataf = mcsm_ppi2_data[column_order]
|
||||
|
||||
#%%============================================================================
|
||||
###################
|
||||
# Sort df based on
|
||||
# position columns
|
||||
###################
|
||||
mcsm_ppi2_dataf.sort_values(by = ['position', 'mutant_type'], inplace = True, ascending = True)
|
||||
|
||||
return(mcsm_ppi2_dataf)
|
||||
#%%#####################################################################
|
|
@ -67,7 +67,7 @@ outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv'
|
|||
# Data: gid+streptomycin
|
||||
#==========================
|
||||
print('Formatting results for:', infile_mcsm_ppi2)
|
||||
mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2)
|
||||
mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2, gene_name = gene)
|
||||
|
||||
# writing file
|
||||
print('Writing formatted df to csv')
|
||||
|
|
|
@ -53,7 +53,7 @@ homedir = os.path.expanduser('~')
|
|||
|
||||
# set working dir
|
||||
os.getcwd()
|
||||
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||
os.getcwd()
|
||||
|
||||
# FIXME: local imports
|
||||
|
@ -170,6 +170,18 @@ infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
|
|||
infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
|
||||
mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
|
||||
|
||||
# more output added
|
||||
## consurf [change colnames]
|
||||
|
||||
infilename_consurf = gene.lower() + '_consurf_grades_f.csv'
|
||||
infile_consurf = outdir + 'consurf/'+ infilename_consurf
|
||||
consurf_df = pd.read_csv(infile_consurf, sep = ',')
|
||||
|
||||
## SNAP2 [add normalised score]
|
||||
infilename_snap2 = gene.lower() + '_snap2_output.csv'
|
||||
infile_snap2 = outdir + 'snap2/'+ infilename_snap2
|
||||
snap2_df = pd.read_csv(infile_snap2, sep = ',')
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# ONLY:for gene pnca and gid: End logic should pick this up!
|
||||
geneL_na = ['gid', 'rpob']
|
||||
|
@ -196,7 +208,7 @@ if gene.lower() in geneL_dy:
|
|||
# infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
|
||||
# mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
|
||||
|
||||
# ONLY:for gene embb and alr: End logic should pick this up!
|
||||
# ONLY:for gene embb and alr and katg: End logic should pick this up!
|
||||
geneL_ppi2 = ['embb', 'alr']
|
||||
#if gene.lower() == "embb" or "alr":
|
||||
if gene.lower() in geneL_ppi2:
|
||||
|
@ -382,6 +394,247 @@ if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max
|
|||
print('\nPASS: Deepddg data is scaled between -1 and 1',
|
||||
'\nproceeding with merge')
|
||||
|
||||
#=======================
|
||||
# Consurf
|
||||
#=======================
|
||||
consurf_df.shape
|
||||
|
||||
# drop row 0: as it contains no value but hangover text
|
||||
consurf_df = consurf_df.drop(index=0)
|
||||
|
||||
#----------------------
|
||||
# rename colums
|
||||
#----------------------
|
||||
consurf_df.columns
|
||||
print('\nRenaming cols and assigning pretty column names')
|
||||
|
||||
geneL_consurf = ['alr', 'katg', 'rpob']
|
||||
|
||||
if gene.lower() in geneL_consurf:
|
||||
consurf_df = consurf_df.rename(columns={'POS' : 'position_consurf'})
|
||||
#---------------------------
|
||||
# Specify the offset
|
||||
#---------------------------
|
||||
print('\nAdding offset value for gene:', gene.lower())
|
||||
|
||||
if gene.lower() == 'alr':
|
||||
offset_val = 34
|
||||
|
||||
print('\nUsing offset val:', offset_val)
|
||||
if gene.lower() == 'katg':
|
||||
offset_val = 23
|
||||
print('\nUsing offset val:', offset_val)
|
||||
|
||||
if gene.lower() == 'rpob':
|
||||
offset_val = 28
|
||||
print('\nUsing offset val:', offset_val)
|
||||
|
||||
consurf_df['position'] = consurf_df['position_consurf'] + offset_val
|
||||
|
||||
else:
|
||||
consurf_df = consurf_df.rename(columns={'POS' : 'position'})
|
||||
|
||||
consurf_df = consurf_df.rename(columns={'SEQ' : 'wild_type'
|
||||
, '3LATOM': 'wt_3upper'
|
||||
, 'SCORE' : 'consurf_score'
|
||||
, 'COLOR' : 'consurf_colour_str'
|
||||
, 'CONFIDENCEINTERVAL' : 'consurf_ci'
|
||||
, 'CONFIDENCEINTERVALCOLORS' : 'consurf_ci_colour'
|
||||
, 'MSADATA' : 'consurf_msa_data'
|
||||
, 'RESIDUEVARIETY' : 'consurf_aa_variety'})
|
||||
# quick check
|
||||
if len(consurf_df) == len(rd_df):
|
||||
print('\nPASS: length of consurf df is as expected'
|
||||
, '\nProceeding to format consurf df')
|
||||
else:
|
||||
print('\nFAIL: length mismatch'
|
||||
, '\nExpected nrows:', len(rd_df)
|
||||
, '\nGot:', len(consurf_df))
|
||||
|
||||
consurf_df.dtypes
|
||||
consurf_df['consurf_score'] = consurf_df['consurf_score'].astype(float)
|
||||
|
||||
consurf_df['consurf_colour'] = consurf_df['consurf_colour_str'].str.extract(r'(\d).*')
|
||||
consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int)
|
||||
|
||||
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*','0')
|
||||
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int)
|
||||
|
||||
consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):')
|
||||
consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci_upper'].astype(float)
|
||||
|
||||
consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci'].str.extract(r':(.*)')
|
||||
consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci_lower'].astype(float)
|
||||
|
||||
#consurf_df['wt_3upper_f'] = consurf_df['wt_3upper'].str.extract(r'^\w{3}(\d+.*)')
|
||||
#consurf_df['wt_3upper_f']
|
||||
consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
|
||||
|
||||
consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)')
|
||||
|
||||
#-------------------------
|
||||
# scale consurf values
|
||||
#-------------------------
|
||||
# Rescale values in consurf_score col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
consurf_min = consurf_df['consurf_score'].min()
|
||||
consurf_max = consurf_df['consurf_score'].max()
|
||||
consurf_min
|
||||
consurf_max
|
||||
|
||||
# quick check
|
||||
len(consurf_df.loc[consurf_df['consurf_score'] >= 0])
|
||||
len(consurf_df.loc[consurf_df['consurf_score'] < 0])
|
||||
|
||||
consurf_scale = lambda x : x/abs(consurf_min) if x < 0 else (x/consurf_max if x >= 0 else 'failed')
|
||||
|
||||
consurf_df['consurf_scaled'] = consurf_df['consurf_score'].apply(consurf_scale)
|
||||
print('\nRaw consurf scores:\n', consurf_df['consurf_score']
|
||||
, '\n---------------------------------------------------------------'
|
||||
, '\nScaled consurf scores:\n', consurf_df['consurf_scaled'])
|
||||
|
||||
# additional check added
|
||||
csmi = consurf_df['consurf_scaled'].min()
|
||||
csma = consurf_df['consurf_scaled'].max()
|
||||
|
||||
c = consurf_df[consurf_df['consurf_score']>=0].count()
|
||||
consurf_pos = c.get(key = 'consurf_score')
|
||||
|
||||
c2 = consurf_df[consurf_df['consurf_scaled']>=0].count()
|
||||
consurf_pos2 = c2.get(key = 'consurf_scaled')
|
||||
|
||||
if consurf_pos == consurf_pos2 and csmi == -1 and csma == 1:
|
||||
print('\nPASS: Consurf values scaled correctly b/w -1 and 1')
|
||||
else:
|
||||
print('\nFAIL: Consurf values scaled numbers MISmatch'
|
||||
, '\nExpected number:', consurf_pos
|
||||
, '\nGot:', consurf_pos2
|
||||
, '\n======================================================')
|
||||
|
||||
consurf_df.dtypes
|
||||
consurf_df.columns
|
||||
|
||||
#---------------------------
|
||||
# select columns
|
||||
# (and also determine order)
|
||||
#---------------------------
|
||||
consurf_df_f = consurf_df[['position'
|
||||
, 'wild_type'
|
||||
, 'chain'
|
||||
, 'wt_3upper'
|
||||
, 'consurf_score'
|
||||
, 'consurf_scaled'
|
||||
, 'consurf_colour'
|
||||
, 'consurf_colour_rev'
|
||||
, 'consurf_ci_upper'
|
||||
, 'consurf_ci_lower'
|
||||
, 'consurf_ci_colour'
|
||||
, 'consurf_msa_data'
|
||||
, 'consurf_aa_variety']]
|
||||
|
||||
#=======================
|
||||
# SNAP2
|
||||
#=======================
|
||||
snap2_df.shape
|
||||
|
||||
#----------------------
|
||||
# rename colums
|
||||
#----------------------
|
||||
geneL_snap2 = ['alr', 'katg', 'rpob']
|
||||
|
||||
if gene.lower() in geneL_snap2:
|
||||
print('\nReading SNAP2 for gene:', gene.lower()
|
||||
, '\nOffset column also being read'
|
||||
, '\nRenaming columns...'
|
||||
, '\nColumn mutationinformation exists. Renaming SNAP2 column variant --> mutationinformation')
|
||||
|
||||
snap2_df = snap2_df.rename(columns = {'mutationinformation': 'mutationinformation'
|
||||
, 'Variant' : 'mutationinformation_snap2'
|
||||
, 'Predicted Effect' : 'snap2_outcome'
|
||||
, 'Score' : 'snap2_score'
|
||||
, 'Expected Accuracy': 'snap2_accuracy_pc'})
|
||||
else:
|
||||
print('\nReading SNAP2 for gene:', gene.lower()
|
||||
, '\nNo offset column for SNAP2'
|
||||
, '\nRenaming columns...'
|
||||
, '\nRenaming SNAP2 column variant --> mutationinformation')
|
||||
|
||||
snap2_df = snap2_df.rename(columns = {'Variant' : 'mutationinformation'
|
||||
, 'Predicted Effect' : 'snap2_outcome'
|
||||
, 'Score' : 'snap2_score'
|
||||
, 'Expected Accuracy': 'snap2_accuracy_pc'})
|
||||
|
||||
snap2_df.columns
|
||||
snap2_df.head()
|
||||
snap2_df.dtypes
|
||||
|
||||
snap2_df['snap2_accuracy_pc'] = snap2_df['snap2_accuracy_pc'].str.replace('%','')
|
||||
snap2_df['snap2_accuracy_pc'] = snap2_df['snap2_accuracy_pc'].astype(int)
|
||||
|
||||
#-------------------------
|
||||
# scale snap2 values
|
||||
#-------------------------
|
||||
# Rescale values in snap2_score col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
snap2_min = snap2_df['snap2_score'].min()
|
||||
snap2_max = snap2_df['snap2_score'].max()
|
||||
snap2_min
|
||||
snap2_max
|
||||
|
||||
# quick check
|
||||
len(snap2_df.loc[snap2_df['snap2_score'] >= 0])
|
||||
len(snap2_df.loc[snap2_df['snap2_score'] < 0])
|
||||
|
||||
snap2_scale = lambda x : x/abs(snap2_min) if x < 0 else (x/snap2_max if x >= 0 else 'failed')
|
||||
|
||||
snap2_df['snap2_scaled'] = snap2_df['snap2_score'].apply(snap2_scale)
|
||||
print('\nRaw snap2 scores:\n', snap2_df['snap2_score']
|
||||
, '\n---------------------------------------------------------------'
|
||||
, '\nScaled snap2 scores:\n', snap2_df['snap2_scaled'])
|
||||
|
||||
# additional check added
|
||||
ssmi = snap2_df['snap2_scaled'].min()
|
||||
ssma = snap2_df['snap2_scaled'].max()
|
||||
|
||||
sn = snap2_df[snap2_df['snap2_score']>=0].count()
|
||||
snap2_pos = sn.get(key = 'snap2_score')
|
||||
|
||||
sn2 = snap2_df[snap2_df['snap2_scaled']>=0].count()
|
||||
snap2_pos2 = sn2.get(key = 'snap2_scaled')
|
||||
|
||||
if snap2_pos == snap2_pos2 and csmi == -1 and csma == 1:
|
||||
print('\nPASS: Snap2 values scaled correctly b/w -1 and 1')
|
||||
else:
|
||||
print('\nFAIL: snap2 values scaled numbers MISmatch'
|
||||
, '\nExpected number:', snap2_pos
|
||||
, '\nGot:', snap2_pos2
|
||||
, '\n======================================================')
|
||||
|
||||
#---------------------------
|
||||
# select columns
|
||||
# (and also determine order)
|
||||
#---------------------------
|
||||
snap2_df.dtypes
|
||||
snap2_df.columns
|
||||
|
||||
geneL_snap2 = ['alr', 'katg', 'rpob']
|
||||
|
||||
if gene.lower() in geneL_snap2:
|
||||
print('\nSelecting cols SNAP2 for gene:', gene.lower())
|
||||
snap2_df_f = snap2_df[['mutationinformation'
|
||||
, 'mutationinformation_snap2'
|
||||
, 'snap2_score'
|
||||
, 'snap2_scaled'
|
||||
, 'snap2_accuracy_pc'
|
||||
, 'snap2_outcome']]
|
||||
else:
|
||||
print('\nSelecting cols SNAP2 for gene:', gene.lower())
|
||||
snap2_df_f = snap2_df[['mutationinformation'
|
||||
, 'snap2_score'
|
||||
, 'snap2_scaled'
|
||||
, 'snap2_accuracy_pc'
|
||||
, 'snap2_outcome']]
|
||||
|
||||
#%%============================================================================
|
||||
# Now merges begin
|
||||
|
||||
|
@ -499,7 +752,9 @@ merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
|
|||
dssp_kd_dfs = pd.merge(dssp_df
|
||||
, kd_df
|
||||
, on = merging_cols_m2
|
||||
, how = "outer")
|
||||
#, how = "outer")
|
||||
, how = "inner")
|
||||
|
||||
|
||||
print('\n\nResult of third merge:', dssp_kd_dfs.shape
|
||||
, '\n===================================================================')
|
||||
|
@ -521,6 +776,26 @@ print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape
|
|||
, '\n===================================================================')
|
||||
dssp_kd_rd_dfs[merging_cols_m3].apply(len)
|
||||
dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
|
||||
|
||||
#%%============================================================================
|
||||
print('==================================='
|
||||
, '\nFourth merge*: fourth merge + consurf_df'
|
||||
, '\dssp_kd_rd_dfs + consurf_df'
|
||||
, '\n===================================')
|
||||
#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = "outer")
|
||||
merging_cols_m3_v2 = detect_common_cols(dssp_kd_rd_dfs, consurf_df)
|
||||
dssp_kd_rd_con_dfs = pd.merge(dssp_kd_rd_dfs
|
||||
, consurf_df
|
||||
, on = merging_cols_m3_v2
|
||||
, how = "outer")
|
||||
|
||||
ncols_m3_v2 = len(dssp_kd_rd_con_dfs.columns)
|
||||
|
||||
print('\n\nResult of fourth merge*:', dssp_kd_rd_con_dfs.shape
|
||||
, '\n===================================================================')
|
||||
dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len)
|
||||
dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len) == len(dssp_kd_rd_con_dfs)
|
||||
|
||||
#%%============================================================================
|
||||
print('======================================='
|
||||
, '\nFifth merge: Second merge + fourth merge'
|
||||
|
|
|
@ -75,15 +75,14 @@ args = arg_parser.parse_args()
|
|||
drug = args.drug
|
||||
gene = args.gene
|
||||
|
||||
#drug = 'pyrazinamide'
|
||||
#gene = 'pncA'
|
||||
|
||||
gene_match = gene + '_p.'
|
||||
print('mut pattern for gene', gene, ':', gene_match)
|
||||
|
||||
nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
|
||||
print('nsSNP for gene', gene, ':', nssnp_match)
|
||||
|
||||
nssnp_match2 = re.compile(nssnp_match)
|
||||
|
||||
wt_regex = gene_match.lower()+'([A-Za-z]{3})'
|
||||
print('wt regex:', wt_regex)
|
||||
|
||||
|
@ -219,20 +218,21 @@ meta_gene_epi = meta_gene_multi.loc[(meta_gene_multi['dr_mult_snp_count']>1) | (
|
|||
|
||||
#%% TEST
|
||||
# formatting, replace !nssnp_match with nothing
|
||||
foo1 = 'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
|
||||
foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'
|
||||
#foo1 = 'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
|
||||
#foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'
|
||||
|
||||
|
||||
foo1_s = foo1.split(';')
|
||||
foo1_s
|
||||
nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
|
||||
arse=list(filter(nssnp_match2.match, foo1_s))
|
||||
arse
|
||||
#foo1_s = foo1.split(';')
|
||||
#foo1_s
|
||||
#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
|
||||
#arse=list(filter(nssnp_match2.match, foo1_s))
|
||||
#arse
|
||||
|
||||
#foo1_s2 = ';'.join(arse)
|
||||
#foo1_s2
|
||||
|
||||
foo1_s2 = ';'.join(arse)
|
||||
foo1_s2
|
||||
#%%
|
||||
nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
|
||||
#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
|
||||
|
||||
# dr_muts_col
|
||||
dr_clean_col = dr_muts_col + '_clean'
|
||||
|
@ -248,6 +248,7 @@ for i, v in enumerate(meta_gene_epi[dr_muts_col]):
|
|||
dr2_s = v.split(';')
|
||||
print(dr2_s)
|
||||
dr2_sf = list(filter(nssnp_match2.match, dr2_s))
|
||||
#dr2_sf = list(filter(nssnp_match.match, dr2_s))
|
||||
print(dr2_sf)
|
||||
dr2_sf2 = ';'.join(dr2_sf)
|
||||
meta_gene_epi[dr_clean_col].iloc[i] = dr2_sf2
|
||||
|
@ -262,13 +263,13 @@ meta_gene_epi[other_clean_col] = ''
|
|||
|
||||
for i, v in enumerate(meta_gene_epi[other_muts_col]):
|
||||
#print(i, v)
|
||||
print('======================================================')
|
||||
print(i)
|
||||
print(v)
|
||||
#print('======================================================')
|
||||
#print(i)
|
||||
#print(v)
|
||||
other2_s = v.split(';')
|
||||
print(other2_s)
|
||||
#print(other2_s)
|
||||
other2_sf = list(filter(nssnp_match2.match, other2_s))
|
||||
print(other2_sf)
|
||||
#print(other2_sf)
|
||||
other2_sf2 = ';'.join(other2_sf)
|
||||
meta_gene_epi[other_clean_col].iloc[i] = other2_sf2
|
||||
|
||||
|
@ -281,7 +282,8 @@ meta_gene_epi_f = meta_gene_epi[['id', 'sample'
|
|||
, 'dr_mult_snp_count'
|
||||
, other_muts_col, other_clean_col
|
||||
, 'other_mult_snp_count']]
|
||||
meta_gene_epi_f.columns
|
||||
#print(meta_gene_epi_f.columns)
|
||||
print(meta_gene_epi_f)
|
||||
|
||||
cols_to_output = ['id', 'sample'
|
||||
, dr_clean_col
|
||||
|
@ -293,7 +295,6 @@ cols_to_output = ['id', 'sample'
|
|||
meta_gene_epi_f2 = meta_gene_epi_f[cols_to_output]
|
||||
|
||||
|
||||
|
||||
#%%
|
||||
# formatting, replace !nssnp_match with nothing
|
||||
#nssnp_neg_match = '(?!pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})'
|
||||
|
|
0
scripts/plotting/basic_barplots_combined.R
Executable file → Normal file
0
scripts/plotting/basic_barplots_combined.R
Executable file → Normal file
0
scripts/plotting/corr_adjusted_PS_LIG.R
Executable file → Normal file
0
scripts/plotting/corr_adjusted_PS_LIG.R
Executable file → Normal file
0
scripts/plotting/dirs.R
Executable file → Normal file
0
scripts/plotting/dirs.R
Executable file → Normal file
0
scripts/plotting/dist_plots_check.R
Executable file → Normal file
0
scripts/plotting/dist_plots_check.R
Executable file → Normal file
0
scripts/plotting/extreme_muts.R
Executable file → Normal file
0
scripts/plotting/extreme_muts.R
Executable file → Normal file
0
scripts/plotting/get_plotting_dfs.R
Executable file → Normal file
0
scripts/plotting/get_plotting_dfs.R
Executable file → Normal file
0
scripts/plotting/ggcorr_all_PS_LIG.R
Executable file → Normal file
0
scripts/plotting/ggcorr_all_PS_LIG.R
Executable file → Normal file
0
scripts/plotting/hist_af_or_base.R
Executable file → Normal file
0
scripts/plotting/hist_af_or_base.R
Executable file → Normal file
0
scripts/plotting/hist_af_or_combined.R
Executable file → Normal file
0
scripts/plotting/hist_af_or_combined.R
Executable file → Normal file
0
scripts/plotting/legend_adjustment.R
Executable file → Normal file
0
scripts/plotting/legend_adjustment.R
Executable file → Normal file
0
scripts/plotting/opp_mcsm_muts.R
Executable file → Normal file
0
scripts/plotting/opp_mcsm_muts.R
Executable file → Normal file
0
scripts/plotting/or_plots_combined.R
Executable file → Normal file
0
scripts/plotting/or_plots_combined.R
Executable file → Normal file
0
scripts/plotting/other_plots_combined.R
Executable file → Normal file
0
scripts/plotting/other_plots_combined.R
Executable file → Normal file
0
scripts/plotting/output_tables.R
Executable file → Normal file
0
scripts/plotting/output_tables.R
Executable file → Normal file
0
scripts/plotting/ps_plots_combined.R
Executable file → Normal file
0
scripts/plotting/ps_plots_combined.R
Executable file → Normal file
0
scripts/plotting/resolving_ambiguous_muts.R
Executable file → Normal file
0
scripts/plotting/resolving_ambiguous_muts.R
Executable file → Normal file
Loading…
Add table
Add a link
Reference in a new issue