From 00b84ccb1c22a27c800bf86448a219fefcb92eda Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 4 Jan 2022 10:45:29 +0000 Subject: [PATCH] handled rpob 5uhc position offset in mcsm_ppi2 --- dynamut/format_results_dynamut.py | 0 dynamut/format_results_dynamut2.py | 0 dynamut/run_format_results_dynamut.py | 0 dynamut/split_csv_chain.sh | 6 +- mcsm_na/examples.py | 0 mcsm_na/format_results_mcsm_na.py | 0 mcsm_na/get_results_mcsm_na.py | 0 mcsm_na/run_format_results_mcsm_na.py | 0 mcsm_na/submit_mcsm_na.py | 0 mcsm_ppi2/format_results_mcsm_ppi2.py | 126 ++++++--- mcsm_ppi2/run_format_results_mcsm_ppi2.py | 2 +- scripts/combining_dfs.py | 281 +++++++++++++++++++- scripts/data_extraction_epistasis.py | 41 +-- scripts/kd_df.py | 2 +- scripts/plotting/basic_barplots_combined.R | 0 scripts/plotting/corr_adjusted_PS_LIG.R | 0 scripts/plotting/dirs.R | 0 scripts/plotting/dist_plots_check.R | 0 scripts/plotting/extreme_muts.R | 0 scripts/plotting/get_plotting_dfs.R | 0 scripts/plotting/ggcorr_all_PS_LIG.R | 0 scripts/plotting/hist_af_or_base.R | 0 scripts/plotting/hist_af_or_combined.R | 0 scripts/plotting/legend_adjustment.R | 0 scripts/plotting/opp_mcsm_muts.R | 0 scripts/plotting/or_plots_combined.R | 0 scripts/plotting/other_plots_combined.R | 0 scripts/plotting/output_tables.R | 0 scripts/plotting/ps_plots_combined.R | 0 scripts/plotting/resolving_ambiguous_muts.R | 0 30 files changed, 395 insertions(+), 63 deletions(-) mode change 100755 => 100644 dynamut/format_results_dynamut.py mode change 100755 => 100644 dynamut/format_results_dynamut2.py mode change 100755 => 100644 dynamut/run_format_results_dynamut.py mode change 100755 => 100644 mcsm_na/examples.py mode change 100755 => 100644 mcsm_na/format_results_mcsm_na.py mode change 100755 => 100644 mcsm_na/get_results_mcsm_na.py mode change 100755 => 100644 mcsm_na/run_format_results_mcsm_na.py mode change 100755 => 100644 mcsm_na/submit_mcsm_na.py mode change 100755 => 100644 scripts/plotting/basic_barplots_combined.R mode change 100755 => 100644 scripts/plotting/corr_adjusted_PS_LIG.R mode change 100755 => 100644 scripts/plotting/dirs.R mode change 100755 => 100644 scripts/plotting/dist_plots_check.R mode change 100755 => 100644 scripts/plotting/extreme_muts.R mode change 100755 => 100644 scripts/plotting/get_plotting_dfs.R mode change 100755 => 100644 scripts/plotting/ggcorr_all_PS_LIG.R mode change 100755 => 100644 scripts/plotting/hist_af_or_base.R mode change 100755 => 100644 scripts/plotting/hist_af_or_combined.R mode change 100755 => 100644 scripts/plotting/legend_adjustment.R mode change 100755 => 100644 scripts/plotting/opp_mcsm_muts.R mode change 100755 => 100644 scripts/plotting/or_plots_combined.R mode change 100755 => 100644 scripts/plotting/other_plots_combined.R mode change 100755 => 100644 scripts/plotting/output_tables.R mode change 100755 => 100644 scripts/plotting/ps_plots_combined.R mode change 100755 => 100644 scripts/plotting/resolving_ambiguous_muts.R diff --git a/dynamut/format_results_dynamut.py b/dynamut/format_results_dynamut.py old mode 100755 new mode 100644 diff --git a/dynamut/format_results_dynamut2.py b/dynamut/format_results_dynamut2.py old mode 100755 new mode 100644 diff --git a/dynamut/run_format_results_dynamut.py b/dynamut/run_format_results_dynamut.py old mode 100755 new mode 100644 diff --git a/dynamut/split_csv_chain.sh b/dynamut/split_csv_chain.sh index ac60faa..0512c1f 100755 --- a/dynamut/split_csv_chain.sh +++ b/dynamut/split_csv_chain.sh @@ -31,7 +31,11 @@ split ../../../${INFILE} -l ${CHUNK} -d snp_batch_ #~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 50 # Date: 05/10/2021 -#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20 +#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh alr_mcsm_formatted_snps_chain.csv snp_batches 20 + +# Date: 30/11/2021 +#~/git/LSHTM_analysis/dynamut/split_csv_chain.sh katg_mcsm_formatted_snps_chain.csv snp_batches 20 +for i in {00..40}; do mv snp_batch_${i} snp_batch_${i}.txt; done # add .txt to the files ######################################################################## diff --git a/mcsm_na/examples.py b/mcsm_na/examples.py old mode 100755 new mode 100644 diff --git a/mcsm_na/format_results_mcsm_na.py b/mcsm_na/format_results_mcsm_na.py old mode 100755 new mode 100644 diff --git a/mcsm_na/get_results_mcsm_na.py b/mcsm_na/get_results_mcsm_na.py old mode 100755 new mode 100644 diff --git a/mcsm_na/run_format_results_mcsm_na.py b/mcsm_na/run_format_results_mcsm_na.py old mode 100755 new mode 100644 diff --git a/mcsm_na/submit_mcsm_na.py b/mcsm_na/submit_mcsm_na.py old mode 100755 new mode 100644 diff --git a/mcsm_ppi2/format_results_mcsm_ppi2.py b/mcsm_ppi2/format_results_mcsm_ppi2.py index 49c05b2..656e47b 100755 --- a/mcsm_ppi2/format_results_mcsm_ppi2.py +++ b/mcsm_ppi2/format_results_mcsm_ppi2.py @@ -24,7 +24,7 @@ from reference_dict import up_3letter_aa_dict from reference_dict import oneletter_aa_dict #%%============================================================================ -def format_mcsm_ppi2_output(mcsm_ppi2_output_csv): +def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name): """ @param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all mcsm snps which is the result of combining all mcsm_ppi2 batch results, and using @@ -78,30 +78,57 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv): # # check # mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD']) - # mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT']) -#%%============================================================================ - ############# - # rename cols - ############# - # format colnames: all lowercase and consistent colnames - mcsm_ppi2_data.columns - print('Assigning meaningful colnames' - , '\n=======================================================') - - my_colnames_dict = {'chain': 'chain' - , 'wild-type': 'wt_upper' - , 'res-number': 'position' - , 'mutant': 'mut_upper' - , 'distance-to-interface': 'interface_dist' - , 'mcsm-ppi2-prediction': 'mcsm_ppi2_affinity' - , 'affinity': 'mcsm_ppi2_outcome' - , 'w_type': 'wild_type' # one letter amino acid code - , 'm_type': 'mutant_type' # one letter amino acid code -} + # mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT']) +#%%===================================================================== +# add offset specified position number for rpob since 5uhc with chain 'C' was +# used to run the analysis + geneL_sp = ['rpob'] + if gene_name.lower() in geneL_sp: + offset = 6 + chain_orig = 'A' + + # Add offset corrected position number. matching with rpob nsSNPs used for mCSM-lig + # and also add corresponding chain id matching with rpob nsSNPs used for mCSM-lig + mcsm_ppi2_data['position'] = mcsm_ppi2_data['res-number'] - offset + mcsm_ppi2_data['chain'] = chain_orig + mcsm_ppi2_data['5uhc_offset'] = offset + + ############# + # rename cols + ############# + # format colnames: all lowercase and consistent colnames + mcsm_ppi2_data.columns + print('Assigning meaningful colnames' + , '\n=======================================================') + + my_colnames_dict = {'chain' : 'chain' + , 'position' : 'position' + , '5uhc_offset' : '5uhc_offset' + , 'wild-type' : 'wt_upper' + , 'res-number' : '5uhc_position' + , 'mutant' : 'mut_upper' + , 'distance-to-interface': 'interface_dist' + , 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity' + , 'affinity' : 'mcsm_ppi2_outcome' + , 'w_type' : 'wild_type' # one letter amino acid code + , 'm_type' : 'mutant_type' # one letter amino acid code + } + else: + my_colnames_dict = {'chain' : 'chain' + , 'wild-type' : 'wt_upper' + , 'res-number' : 'position' + , 'mutant' : 'mut_upper' + , 'distance-to-interface': 'interface_dist' + , 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity' + , 'affinity' : 'mcsm_ppi2_outcome' + , 'w_type' : 'wild_type' # one letter amino acid code + , 'm_type' : 'mutant_type' # one letter amino acid code + } +#%%============================================================================== mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True) mcsm_ppi2_data.columns - + ############# # create mutationinformation column ############# @@ -137,22 +164,47 @@ def format_mcsm_ppi2_output(mcsm_ppi2_output_csv): , '\nExpected number:', mcsm_ppi2_pos , '\nGot:', mcsm_ppi2_pos2 , '\n======================================================') - #%%===================================================================== - ############# + ################### # reorder columns - ############# + ################### mcsm_ppi2_data.columns - mcsm_ppi2_dataf = mcsm_ppi2_data[['mutationinformation' - , 'mcsm_ppi2_affinity' - , 'mcsm_ppi2_scaled' - , 'mcsm_ppi2_outcome' - , 'interface_dist' - , 'wild_type' - , 'position' - , 'mutant_type' - , 'wt_upper' - , 'mut_upper' - , 'chain']] + + #--------------------- + # Determine col order + #--------------------- + + core_cols = ['mutationinformation' + , 'mcsm_ppi2_affinity' + , 'mcsm_ppi2_scaled' + , 'mcsm_ppi2_outcome' + , 'interface_dist' + , 'wild_type' + , 'position' + , 'mutant_type' + , 'wt_upper' + , 'mut_upper' + , 'chain'] + + if gene_name.lower() in geneL_sp: + + column_order = core_cols + ['5uhc_offset', '5uhc_position'] + + else: + + column_order = core_cols.copy() + + #-------------- + # reorder now + #-------------- + mcsm_ppi2_dataf = mcsm_ppi2_data[column_order] + +#%%============================================================================ + ################### + # Sort df based on + # position columns + ################### + mcsm_ppi2_dataf.sort_values(by = ['position', 'mutant_type'], inplace = True, ascending = True) + return(mcsm_ppi2_dataf) -#%%##################################################################### +#%%##################################################################### \ No newline at end of file diff --git a/mcsm_ppi2/run_format_results_mcsm_ppi2.py b/mcsm_ppi2/run_format_results_mcsm_ppi2.py index ab036f9..e0f3612 100755 --- a/mcsm_ppi2/run_format_results_mcsm_ppi2.py +++ b/mcsm_ppi2/run_format_results_mcsm_ppi2.py @@ -67,7 +67,7 @@ outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv' # Data: gid+streptomycin #========================== print('Formatting results for:', infile_mcsm_ppi2) -mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2) +mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2, gene_name = gene) # writing file print('Writing formatted df to csv') diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index d6cb2fd..b33d36f 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -53,7 +53,7 @@ homedir = os.path.expanduser('~') # set working dir os.getcwd() -#os.chdir(homedir + '/git/LSHTM_analysis/scripts') +os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.getcwd() # FIXME: local imports @@ -170,6 +170,18 @@ infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv' infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None) +# more output added +## consurf [change colnames] + +infilename_consurf = gene.lower() + '_consurf_grades_f.csv' +infile_consurf = outdir + 'consurf/'+ infilename_consurf +consurf_df = pd.read_csv(infile_consurf, sep = ',') + +## SNAP2 [add normalised score] +infilename_snap2 = gene.lower() + '_snap2_output.csv' +infile_snap2 = outdir + 'snap2/'+ infilename_snap2 +snap2_df = pd.read_csv(infile_snap2, sep = ',') + #------------------------------------------------------------------------------ # ONLY:for gene pnca and gid: End logic should pick this up! geneL_na = ['gid', 'rpob'] @@ -196,7 +208,7 @@ if gene.lower() in geneL_dy: # infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na # mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',') -# ONLY:for gene embb and alr: End logic should pick this up! +# ONLY:for gene embb and alr and katg: End logic should pick this up! geneL_ppi2 = ['embb', 'alr'] #if gene.lower() == "embb" or "alr": if gene.lower() in geneL_ppi2: @@ -381,6 +393,247 @@ else: if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1: print('\nPASS: Deepddg data is scaled between -1 and 1', '\nproceeding with merge') + +#======================= +# Consurf +#======================= +consurf_df.shape + +# drop row 0: as it contains no value but hangover text +consurf_df = consurf_df.drop(index=0) + +#---------------------- +# rename colums +#---------------------- +consurf_df.columns +print('\nRenaming cols and assigning pretty column names') + +geneL_consurf = ['alr', 'katg', 'rpob'] + +if gene.lower() in geneL_consurf: + consurf_df = consurf_df.rename(columns={'POS' : 'position_consurf'}) + #--------------------------- + # Specify the offset + #--------------------------- + print('\nAdding offset value for gene:', gene.lower()) + + if gene.lower() == 'alr': + offset_val = 34 + + print('\nUsing offset val:', offset_val) + if gene.lower() == 'katg': + offset_val = 23 + print('\nUsing offset val:', offset_val) + + if gene.lower() == 'rpob': + offset_val = 28 + print('\nUsing offset val:', offset_val) + + consurf_df['position'] = consurf_df['position_consurf'] + offset_val + +else: + consurf_df = consurf_df.rename(columns={'POS' : 'position'}) + +consurf_df = consurf_df.rename(columns={'SEQ' : 'wild_type' + , '3LATOM': 'wt_3upper' + , 'SCORE' : 'consurf_score' + , 'COLOR' : 'consurf_colour_str' + , 'CONFIDENCEINTERVAL' : 'consurf_ci' + , 'CONFIDENCEINTERVALCOLORS' : 'consurf_ci_colour' + , 'MSADATA' : 'consurf_msa_data' + , 'RESIDUEVARIETY' : 'consurf_aa_variety'}) +# quick check +if len(consurf_df) == len(rd_df): + print('\nPASS: length of consurf df is as expected' + , '\nProceeding to format consurf df') +else: + print('\nFAIL: length mismatch' + , '\nExpected nrows:', len(rd_df) + , '\nGot:', len(consurf_df)) + +consurf_df.dtypes +consurf_df['consurf_score'] = consurf_df['consurf_score'].astype(float) + +consurf_df['consurf_colour'] = consurf_df['consurf_colour_str'].str.extract(r'(\d).*') +consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int) + +consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*','0') +consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int) + +consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):') +consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci_upper'].astype(float) + +consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci'].str.extract(r':(.*)') +consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci_lower'].astype(float) + +#consurf_df['wt_3upper_f'] = consurf_df['wt_3upper'].str.extract(r'^\w{3}(\d+.*)') +#consurf_df['wt_3upper_f'] +consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '') + +consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)') + +#------------------------- +# scale consurf values +#------------------------- +# Rescale values in consurf_score col b/w -1 and 1 so negative numbers +# stay neg and pos numbers stay positive +consurf_min = consurf_df['consurf_score'].min() +consurf_max = consurf_df['consurf_score'].max() +consurf_min +consurf_max + +# quick check +len(consurf_df.loc[consurf_df['consurf_score'] >= 0]) +len(consurf_df.loc[consurf_df['consurf_score'] < 0]) + +consurf_scale = lambda x : x/abs(consurf_min) if x < 0 else (x/consurf_max if x >= 0 else 'failed') + +consurf_df['consurf_scaled'] = consurf_df['consurf_score'].apply(consurf_scale) +print('\nRaw consurf scores:\n', consurf_df['consurf_score'] + , '\n---------------------------------------------------------------' + , '\nScaled consurf scores:\n', consurf_df['consurf_scaled']) + +# additional check added +csmi = consurf_df['consurf_scaled'].min() +csma = consurf_df['consurf_scaled'].max() + +c = consurf_df[consurf_df['consurf_score']>=0].count() +consurf_pos = c.get(key = 'consurf_score') + +c2 = consurf_df[consurf_df['consurf_scaled']>=0].count() +consurf_pos2 = c2.get(key = 'consurf_scaled') + +if consurf_pos == consurf_pos2 and csmi == -1 and csma == 1: + print('\nPASS: Consurf values scaled correctly b/w -1 and 1') +else: + print('\nFAIL: Consurf values scaled numbers MISmatch' + , '\nExpected number:', consurf_pos + , '\nGot:', consurf_pos2 + , '\n======================================================') + +consurf_df.dtypes +consurf_df.columns + +#--------------------------- +# select columns +# (and also determine order) +#--------------------------- +consurf_df_f = consurf_df[['position' + , 'wild_type' + , 'chain' + , 'wt_3upper' + , 'consurf_score' + , 'consurf_scaled' + , 'consurf_colour' + , 'consurf_colour_rev' + , 'consurf_ci_upper' + , 'consurf_ci_lower' + , 'consurf_ci_colour' + , 'consurf_msa_data' + , 'consurf_aa_variety']] + +#======================= +# SNAP2 +#======================= +snap2_df.shape + +#---------------------- +# rename colums +#---------------------- +geneL_snap2 = ['alr', 'katg', 'rpob'] + +if gene.lower() in geneL_snap2: + print('\nReading SNAP2 for gene:', gene.lower() + , '\nOffset column also being read' + , '\nRenaming columns...' + , '\nColumn mutationinformation exists. Renaming SNAP2 column variant --> mutationinformation') + + snap2_df = snap2_df.rename(columns = {'mutationinformation': 'mutationinformation' + , 'Variant' : 'mutationinformation_snap2' + , 'Predicted Effect' : 'snap2_outcome' + , 'Score' : 'snap2_score' + , 'Expected Accuracy': 'snap2_accuracy_pc'}) +else: + print('\nReading SNAP2 for gene:', gene.lower() + , '\nNo offset column for SNAP2' + , '\nRenaming columns...' + , '\nRenaming SNAP2 column variant --> mutationinformation') + + snap2_df = snap2_df.rename(columns = {'Variant' : 'mutationinformation' + , 'Predicted Effect' : 'snap2_outcome' + , 'Score' : 'snap2_score' + , 'Expected Accuracy': 'snap2_accuracy_pc'}) + +snap2_df.columns +snap2_df.head() +snap2_df.dtypes + +snap2_df['snap2_accuracy_pc'] = snap2_df['snap2_accuracy_pc'].str.replace('%','') +snap2_df['snap2_accuracy_pc'] = snap2_df['snap2_accuracy_pc'].astype(int) + +#------------------------- +# scale snap2 values +#------------------------- +# Rescale values in snap2_score col b/w -1 and 1 so negative numbers +# stay neg and pos numbers stay positive +snap2_min = snap2_df['snap2_score'].min() +snap2_max = snap2_df['snap2_score'].max() +snap2_min +snap2_max + +# quick check +len(snap2_df.loc[snap2_df['snap2_score'] >= 0]) +len(snap2_df.loc[snap2_df['snap2_score'] < 0]) + +snap2_scale = lambda x : x/abs(snap2_min) if x < 0 else (x/snap2_max if x >= 0 else 'failed') + +snap2_df['snap2_scaled'] = snap2_df['snap2_score'].apply(snap2_scale) +print('\nRaw snap2 scores:\n', snap2_df['snap2_score'] + , '\n---------------------------------------------------------------' + , '\nScaled snap2 scores:\n', snap2_df['snap2_scaled']) + +# additional check added +ssmi = snap2_df['snap2_scaled'].min() +ssma = snap2_df['snap2_scaled'].max() + +sn = snap2_df[snap2_df['snap2_score']>=0].count() +snap2_pos = sn.get(key = 'snap2_score') + +sn2 = snap2_df[snap2_df['snap2_scaled']>=0].count() +snap2_pos2 = sn2.get(key = 'snap2_scaled') + +if snap2_pos == snap2_pos2 and csmi == -1 and csma == 1: + print('\nPASS: Snap2 values scaled correctly b/w -1 and 1') +else: + print('\nFAIL: snap2 values scaled numbers MISmatch' + , '\nExpected number:', snap2_pos + , '\nGot:', snap2_pos2 + , '\n======================================================') + +#--------------------------- +# select columns +# (and also determine order) +#--------------------------- +snap2_df.dtypes +snap2_df.columns + +geneL_snap2 = ['alr', 'katg', 'rpob'] + +if gene.lower() in geneL_snap2: + print('\nSelecting cols SNAP2 for gene:', gene.lower()) + snap2_df_f = snap2_df[['mutationinformation' + , 'mutationinformation_snap2' + , 'snap2_score' + , 'snap2_scaled' + , 'snap2_accuracy_pc' + , 'snap2_outcome']] +else: + print('\nSelecting cols SNAP2 for gene:', gene.lower()) + snap2_df_f = snap2_df[['mutationinformation' + , 'snap2_score' + , 'snap2_scaled' + , 'snap2_accuracy_pc' + , 'snap2_outcome']] #%%============================================================================ # Now merges begin @@ -499,7 +752,9 @@ merging_cols_m2 = detect_common_cols(dssp_df, kd_df) dssp_kd_dfs = pd.merge(dssp_df , kd_df , on = merging_cols_m2 - , how = "outer") + #, how = "outer") + , how = "inner") + print('\n\nResult of third merge:', dssp_kd_dfs.shape , '\n===================================================================') @@ -521,6 +776,26 @@ print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape , '\n===================================================================') dssp_kd_rd_dfs[merging_cols_m3].apply(len) dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs) + +#%%============================================================================ +print('===================================' + , '\nFourth merge*: fourth merge + consurf_df' + , '\dssp_kd_rd_dfs + consurf_df' + , '\n===================================') +#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = "outer") +merging_cols_m3_v2 = detect_common_cols(dssp_kd_rd_dfs, consurf_df) +dssp_kd_rd_con_dfs = pd.merge(dssp_kd_rd_dfs + , consurf_df + , on = merging_cols_m3_v2 + , how = "outer") + +ncols_m3_v2 = len(dssp_kd_rd_con_dfs.columns) + +print('\n\nResult of fourth merge*:', dssp_kd_rd_con_dfs.shape + , '\n===================================================================') +dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len) +dssp_kd_rd_con_dfs[merging_cols_m3_v2].apply(len) == len(dssp_kd_rd_con_dfs) + #%%============================================================================ print('=======================================' , '\nFifth merge: Second merge + fourth merge' diff --git a/scripts/data_extraction_epistasis.py b/scripts/data_extraction_epistasis.py index 6626338..094a82b 100755 --- a/scripts/data_extraction_epistasis.py +++ b/scripts/data_extraction_epistasis.py @@ -75,15 +75,14 @@ args = arg_parser.parse_args() drug = args.drug gene = args.gene -#drug = 'pyrazinamide' -#gene = 'pncA' - gene_match = gene + '_p.' print('mut pattern for gene', gene, ':', gene_match) nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}' print('nsSNP for gene', gene, ':', nssnp_match) +nssnp_match2 = re.compile(nssnp_match) + wt_regex = gene_match.lower()+'([A-Za-z]{3})' print('wt regex:', wt_regex) @@ -219,20 +218,21 @@ meta_gene_epi = meta_gene_multi.loc[(meta_gene_multi['dr_mult_snp_count']>1) | ( #%% TEST # formatting, replace !nssnp_match with nothing -foo1 = 'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX' -foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala' +#foo1 = 'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX' +#foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala' -foo1_s = foo1.split(';') -foo1_s -nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})') -arse=list(filter(nssnp_match2.match, foo1_s)) -arse +#foo1_s = foo1.split(';') +#foo1_s +#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})') +#arse=list(filter(nssnp_match2.match, foo1_s)) +#arse + +#foo1_s2 = ';'.join(arse) +#foo1_s2 -foo1_s2 = ';'.join(arse) -foo1_s2 #%% -nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})') +#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})') # dr_muts_col dr_clean_col = dr_muts_col + '_clean' @@ -248,6 +248,7 @@ for i, v in enumerate(meta_gene_epi[dr_muts_col]): dr2_s = v.split(';') print(dr2_s) dr2_sf = list(filter(nssnp_match2.match, dr2_s)) + #dr2_sf = list(filter(nssnp_match.match, dr2_s)) print(dr2_sf) dr2_sf2 = ';'.join(dr2_sf) meta_gene_epi[dr_clean_col].iloc[i] = dr2_sf2 @@ -262,13 +263,13 @@ meta_gene_epi[other_clean_col] = '' for i, v in enumerate(meta_gene_epi[other_muts_col]): #print(i, v) - print('======================================================') - print(i) - print(v) + #print('======================================================') + #print(i) + #print(v) other2_s = v.split(';') - print(other2_s) + #print(other2_s) other2_sf = list(filter(nssnp_match2.match, other2_s)) - print(other2_sf) + #print(other2_sf) other2_sf2 = ';'.join(other2_sf) meta_gene_epi[other_clean_col].iloc[i] = other2_sf2 @@ -281,7 +282,8 @@ meta_gene_epi_f = meta_gene_epi[['id', 'sample' , 'dr_mult_snp_count' , other_muts_col, other_clean_col , 'other_mult_snp_count']] -meta_gene_epi_f.columns +#print(meta_gene_epi_f.columns) +print(meta_gene_epi_f) cols_to_output = ['id', 'sample' , dr_clean_col @@ -293,7 +295,6 @@ cols_to_output = ['id', 'sample' meta_gene_epi_f2 = meta_gene_epi_f[cols_to_output] - #%% # formatting, replace !nssnp_match with nothing #nssnp_neg_match = '(?!pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})' diff --git a/scripts/kd_df.py b/scripts/kd_df.py index 53b1264..50748de 100755 --- a/scripts/kd_df.py +++ b/scripts/kd_df.py @@ -92,7 +92,7 @@ else: infile_fasta = indir + '/' + in_filename_fasta print('Input fasta file:', infile_fasta , '\n============================================================') - + #======= # output #======= diff --git a/scripts/plotting/basic_barplots_combined.R b/scripts/plotting/basic_barplots_combined.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/corr_adjusted_PS_LIG.R b/scripts/plotting/corr_adjusted_PS_LIG.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/dirs.R b/scripts/plotting/dirs.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/dist_plots_check.R b/scripts/plotting/dist_plots_check.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/extreme_muts.R b/scripts/plotting/extreme_muts.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/ggcorr_all_PS_LIG.R b/scripts/plotting/ggcorr_all_PS_LIG.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/hist_af_or_base.R b/scripts/plotting/hist_af_or_base.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/hist_af_or_combined.R b/scripts/plotting/hist_af_or_combined.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/legend_adjustment.R b/scripts/plotting/legend_adjustment.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/opp_mcsm_muts.R b/scripts/plotting/opp_mcsm_muts.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/or_plots_combined.R b/scripts/plotting/or_plots_combined.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/other_plots_combined.R b/scripts/plotting/other_plots_combined.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/output_tables.R b/scripts/plotting/output_tables.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/ps_plots_combined.R b/scripts/plotting/ps_plots_combined.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/resolving_ambiguous_muts.R b/scripts/plotting/resolving_ambiguous_muts.R old mode 100755 new mode 100644