From f355846dae4327ee96eff80aea3a50ef431fc281 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 15 Jun 2022 18:36:28 +0100 Subject: [PATCH] added active site indication for merged_dfs in count_vars_ML.R and also added 'gene_name' in combining_dfs.py --- scripts/combining_dfs.py | 46 +++++++++++++++++++++-------- scripts/count_vars_ML.R | 27 +++++++++++++++-- scripts/plotting/get_plotting_dfs.R | 1 + 3 files changed, 59 insertions(+), 15 deletions(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index 62b5543..12e4a32 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -124,7 +124,7 @@ if gene.lower() == "embb": print("\nReading mCSM file for gene:", gene) #in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798 #in_filename_mcsm = gene.lower() + '_complex_mcsm_norm2.csv' #844 - in_filename_mcsm = gene.lower() + '_complex_mcsm_norm3.csv' #851 + in_filename_mcsm = gene.lower() + '_complex_mcsm_norm3.csv' #858 [got all muts on 15/05/2022] if gene.lower() in gene_list_normal: print("\nReading mCSM file for gene:", gene) @@ -1081,21 +1081,35 @@ combined_df = pd.merge(combined_7dfs combined_df_expected_cols = len(combined_7dfs.columns) + len(snap2_df_f.columns) - len(merging_cols_m4_v2) -#%% Dimension checks for combined_df: +#%% Dimension checks for combined_df: specially because embB didn't have mCSM +# values for all muts. This was fixed in UQ, and this rerun on 15/05/2022 +#------------------------------------------------------------------------------ +# No longer required since mCSM values were fetched for all muts! # CHECK whether logic effects anything else! -if not gene == "embB": - print("\nGene is:", gene) - if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols: - print('PASS: successfully combined 8 dfs' - , '\nNo. of rows combined_df:', len(combined_df) - , '\nNo. 
of cols combined_df:', len(combined_df.columns)) -elif len(combined_df) == len(foldx_df) and len(combined_df.columns) == combined_df_expected_cols: +# if not gene == "embB": +# print("\nGene is:", gene) +# if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols: +# print('PASS: successfully combined 8 dfs' +# , '\nNo. of rows combined_df:', len(combined_df) +# , '\nNo. of cols combined_df:', len(combined_df.columns)) +# elif len(combined_df) == len(foldx_df) and len(combined_df.columns) == combined_df_expected_cols: +# print('PASS: successfully combined 8 dfs' +# , '\nNo. of rows combined_df:', len(combined_df) +# , '\nNo. of cols combined_df:', len(combined_df.columns)) +# else: +# sys.exit('FAIL: check individual merges for seventh merge') +#------------------------------------------------------------------------------ +print("\nRunning Dimension check for combined_df for gene:", gene) + +if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols: print('PASS: successfully combined 8 dfs' - , '\nNo. of rows combined_df:', len(combined_df) - , '\nNo. of cols combined_df:', len(combined_df.columns)) + , '\nNo. of rows combined_df:', len(combined_df) + , '\nNo. 
of cols combined_df:', len(combined_df.columns)) else: - sys.exit('FAIL: check individual merges for seventh merge') - + sys.exit('FAIL: check individual merges for seventh merge') + +print('\n====================================================') + print('\nResult of Seventh merge:', combined_df.shape , '\n===================================================================') @@ -1361,6 +1375,12 @@ else: , '\nGot:', len(combined_all_params_f2.columns) , '\nExpected nrows:', expected_nrows , '\nGot:', len(combined_all_params_f2) ) + +#--------------------------------------- +# Add gene name +#--------------------------------------- +combined_all_params_f2['gene_name'] = gene.lower() + #--------------------------------------- # Add pdb_file name at the end #--------------------------------------- diff --git a/scripts/count_vars_ML.R b/scripts/count_vars_ML.R index 28cf570..12bc5bd 100644 --- a/scripts/count_vars_ML.R +++ b/scripts/count_vars_ML.R @@ -2,13 +2,33 @@ #source("~/git/LSHTM_analysis/config/alr.R") #source("~/git/LSHTM_analysis/config/embb.R") -#source("~/git/LSHTM_analysis/config/gid.R") +##source("~/git/LSHTM_analysis/config/gid.R") #source("~/git/LSHTM_analysis/config/katg.R") #source("~/git/LSHTM_analysis/config/pnca.R") -#source("~/git/LSHTM_analysis/config/rpob.R") +source("~/git/LSHTM_analysis/config/rpob.R") source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +################################################ +# Add active site indication +############################################### +merged_df2$active_site = as.integer(merged_df2$position %in% active_aa_pos) +merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa_pos) + +merged_df3$active_site = as.integer(merged_df3$position %in% active_aa_pos) +merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos) + +# sanity check +table(merged_df2$active_site) +table(merged_df3$active_site) + +if( all(table(merged_df2$active_site) == 
table(as.integer(merged_df2$position %in% active_aa_pos))) && + all(table(merged_df3$active_site) == table(as.integer(merged_df3$position %in% active_aa_pos))) +){ + cat('\nActive site indications successfully applied to merged_dfs for gene:', tolower(gene)) +} + + gene gene_match @@ -68,6 +88,9 @@ sum(table(merged_df3$drtype_mode_labels)) table(merged_df3$lineage) sum(table(merged_df3$lineage_labels)) +cat("\nWriting merged_df3 for:" + , "\nDrug:", drug + , "\nGene:", gene) # write file outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv') outfile_merged_df3 diff --git a/scripts/plotting/get_plotting_dfs.R b/scripts/plotting/get_plotting_dfs.R index 7c6b4e4..a2c77d3 100644 --- a/scripts/plotting/get_plotting_dfs.R +++ b/scripts/plotting/get_plotting_dfs.R @@ -87,6 +87,7 @@ merged_df2 = all_plot_dfs[[1]] merged_df3 = all_plot_dfs[[2]] merged_df2_comp = all_plot_dfs[[3]] merged_df3_comp = all_plot_dfs[[4]] + #====================================================================== ####################################################################