From a2bcc3a73255058e063053c47d768bfbfe065c6d Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 25 May 2022 08:50:33 +0100 Subject: [PATCH] added mmcsm_lig and provean dfs merges in comnining_df.py --- scripts/combining_dfs.py | 175 +++++++++++++++++++++++++++++++++++++-- scripts/count_vars_ML.R | 56 ++++++++++++- 2 files changed, 220 insertions(+), 11 deletions(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index c18b8ae..62b5543 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -23,6 +23,9 @@ Created on Tue Aug 6 12:56:03 2019 #12) .lower()'_mcsm_ppi2.csv' #13) .lower()'_consurf.csv' #14) .lower()'_snap2.csv' +#15) .lower()'_provean.csv +#16) .lower()'_mmcsm_lig_results.csv' +#17) .lower()'_edXXX'!!!! TODO # combining order @@ -43,6 +46,7 @@ from pandas import DataFrame import numpy as np import argparse from functools import reduce +from sklearn.preprocessing import MinMaxScaler #======================================================================= #%% specify input and curr dir homedir = os.path.expanduser('~') @@ -168,8 +172,7 @@ infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None) # more output added -## consurf [change colnames] - +## consurf [change colnames]? [add outcome category] infilename_consurf = gene.lower() + '_consurf_grades_f.csv' infile_consurf = outdir + 'consurf/'+ infilename_consurf consurf_df = pd.read_csv(infile_consurf, sep = ',') @@ -179,6 +182,16 @@ infilename_snap2 = gene.lower() + '_snap2_output.csv' infile_snap2 = outdir + 'snap2/'+ infilename_snap2 snap2_df = pd.read_csv(infile_snap2, sep = ',') +## PROVEAN +infilename_provean = gene.lower() + '_provean.csv' +infile_provean = outdir + 'provean/'+ infilename_provean +provean_df = pd.read_csv(infile_provean, sep = ',',header = None ) + +# mmCSM-lig +infilename_mmcsm = gene.lower() + '_mmcsm_results.csv' +infile_mmcsm = outdir + 'mmcsm_lig/single_muts/'+ infilename_mmcsm +mmcsm_lig_raw = pd.read_csv(infile_mmcsm, sep = ',') + #------------------------------------------------------------------------------ # ONLY: for gene 'gid' and 'rpob': End logic should pick this up! geneL_na = ['gid', 'rpob'] @@ -737,7 +750,117 @@ else: , 'snap2_scaled' , 'snap2_accuracy_pc' , 'snap2_outcome']] - + +#======================= +# Provean +#======================= +provean_df.head() +provean_df.columns = ['mutationinformation', 'provean_score', 'provean_outcome'] +provean_df.head() +provean_df['provean_outcome'].value_counts() + +#---------------------------------------- +# Rescale values in provean_score +# col b/w -1 and 1 so negative numbers +# stay neg and pos numbers stay positive +# cut off =-2.5 +# so provean scores >= (-2.5) are neutral +# and provean scores < (2.5) are deleterious +#----------------------------------------- +provean_min = provean_df['provean_score'].min() +provean_max = provean_df['provean_score'].max() +print('\nprovean_score (MIN):', provean_min + , '\nprovean_score (MAX):', provean_max) + +# quick check +provean_cut_off = -2.5 + +if (provean_df['provean_score'] > provean_cut_off).sum() == provean_df['provean_outcome'].value_counts()['Neutral']: + print('\nPASS: Provean cut off is indeed:', provean_cut_off + , '\nNo. of values above', provean_cut_off, 'i.e classed as Neutral:' + , (provean_df['provean_score']>provean_cut_off).sum() + , '\nProvean outcome:' + , '\nNeutral:', len(provean_df.loc[provean_df['provean_score'] > provean_cut_off]) + , '\nDeleterious:', len(provean_df.loc[provean_df['provean_score'] < provean_cut_off]) + ) +else: + sys.exit('\nFAIL: Numbers mismatch. Please check provean cut off and condition used') + +# RECHECK logic!: CANNOT use this as it changes the data distribution, as seen from the his plot +# provean_scale = lambda x : x/abs(provean_min) if x < 0 else (x/provean_min if x >= 0 else 'failed') +# provean_df['provean_scaled1'] = provean_df.loc[:,'provean_score'].apply(provean_scale) +# print('\nRaw provean scores:\n', provean_df['provean_score'] +# , '\n---------------------------------------------------------------' +# , '\nScaled provean scores:\n', provean_df['provean_scaled']) + +# print('\nprovean raw (Max):' , provean_df['provean_score'].max() +# , '\nprovean scaled (Max):' , provean_df['provean_scaled1'].max()) +# print('\nprovean raw (Min):' , provean_df['provean_score'].min() +# , '\nprovean scaled (Min):' , provean_df['provean_scaled1'].min()) + +scaler = MinMaxScaler() +provean_df['provean_scaled'] = scaler.fit_transform(provean_df['provean_score'].values.reshape(-1,1)) + +provean_df['provean_score'].hist(bins = 30) +#provean_df['provean_scaled1'].hist(bins = 10) +provean_df['provean_scaled'].hist(bins = 30) + +#======================= +# mmCSM-lig +#======================= +mmcsm_lig_raw.columns + +# extract specific columns: might be simpler +mmcsm_lig_df = mmcsm_lig_raw[['MUTATION', 'CHAIN', 'DDG']] +mmcsm_lig_df['CHAIN'].value_counts() + +# Drop the chain column +mmcsm_lig_df.drop(['CHAIN'], axis = 1, inplace = True) + +# Rename columns using lower case and consistently to allow merge later on +mmcsm_lig_df.rename({'MUTATION': 'mutationinformation' + , 'DDG': 'mmcsm_lig'}, axis = 1, inplace = True) + +#---------------------------------------- +# Rescale values in mmcsm_lig_affinity +# col b/w -1 and 1 so negative numbers +# stay neg and pos numbers stay positive +#----------------------------------------- +mmcsm_lig_min = mmcsm_lig_df['mmcsm_lig'].min() +mmcsm_lig_max = mmcsm_lig_df['mmcsm_lig'].max() +print('\nmmcsm_lig (MIN):', mmcsm_lig_min + , '\nmmcsm_lig (MAX):', mmcsm_lig_max) + +# quick check +print('\nNo. of Stabilising mmCSM mutations:', len(mmcsm_lig_df.loc[mmcsm_lig_df['mmcsm_lig'] >= 0])) +print('\nNo. of Destabilising mmCSM mutations:', len(mmcsm_lig_df.loc[mmcsm_lig_df['mmcsm_lig'] < 0])) + +mmcsm_ligscale = lambda x : x/abs(mmcsm_lig_min) if x < 0 else (x/mmcsm_lig_max if x >= 0 else 'failed') + +mmcsm_lig_df['mmcsm_lig_scaled'] = mmcsm_lig_df.loc[:,'mmcsm_lig'].apply(mmcsm_ligscale) +print('\nRaw mmcsm_lig scores:\n', mmcsm_lig_df['mmcsm_lig'] + , '\n---------------------------------------------------------------' + , '\nScaled mmcsm_lig scores:\n', mmcsm_lig_df['mmcsm_lig_scaled']) + +print('\nmmCSM lig raw (Max):', mmcsm_lig_df['mmcsm_lig'].max() + , '\nmmCSM lig scaled (Max):', mmcsm_lig_df['mmcsm_lig_scaled'].max()) + +print('\nmmCSM lig raw (Min):', mmcsm_lig_df['mmcsm_lig'].min() + , '\nmmCSM lig scaled (Min):', mmcsm_lig_df['mmcsm_lig_scaled'].min()) + +mmcsm_lig_df['mmcsm_lig_scaled'].hist(bins = 30) +mmcsm_lig_df['mmcsm_lig'].hist(bins = 30) + +#----------------------------- +# mmCSM lig outcome category: +# -ve: Destabilising +# +ve: Stabilising +#---------------------------- +mmcsm_lig_df['mmcsm_lig_outcome'] = mmcsm_lig_df.loc[:,'mmcsm_lig'].apply(lambda x: 'Destabilising' if x < 0 else 'Stabilising') +mmcsm_lig_df[mmcsm_lig_df['mmcsm_lig']<0].count() + +del(mmcsm_lig_raw) + #%%============================================================================ # Now merges begin print('===================================' @@ -1201,9 +1324,47 @@ redundant_colsL = ['mutationinformation_snap2' combined_all_params_f = combined_all_params.drop(redundant_colsL , axis = 1 , errors = 'ignore') +#--------------------------------------- +# MERGE 7 [UQ]: provean and mmCSM-lig dfs +#--------------------------------------- +if len(combined_all_params_f) == len(provean_df) == len(mmcsm_lig_df): + print('\nPASS: length of Provean and mmCSM-lig df length match with combined_df.' + , '\nProceeding with FINAL merging before writing file...') +else: + sys.exit('\nFAIL: Cannot do final merge! Check lengths of consurf, provean and combined_all_params_f dfs') + +merging_cols_m7 = detect_common_cols(provean_df, mmcsm_lig_df) +print('\nMering provean and mmcsm-lig dfs on:', merging_cols_m7) +pr_mm_df = pd.merge(provean_df + , mmcsm_lig_df + , on = merging_cols_m7) + +#----------------------------------------------- +# MERGE 8 [UQ]: combined_all_params_f + pr_mm_df +#----------------------------------------------- +merging_cols_m8 = detect_common_cols(combined_all_params_f, pr_mm_df) +print('\nMering all combined_dfs + (pr_mm_df) on:', merging_cols_m8) + +combined_all_params_f2 = pd.merge(combined_all_params_f + , pr_mm_df + , on = merging_cols_m8) + +expected_ncols = len(combined_all_params_f.columns) + len(pr_mm_df.columns) - len(merging_cols_m8) +expected_nrows = len(combined_all_params_f2) + +if len(combined_all_params_f2.columns) == expected_ncols and len(combined_all_params_f2) == expected_nrows: + print('\nPASS: All dfs combined including PROVEAN and mmCSM-lig') +else: + print('\nFAIL:lengths mismatch' + , '\nExpected ncols:', expected_ncols + , '\nGot:', len(combined_all_params_f2.columns) + , '\nExpected nrows:', expected_nrows + , '\nGot:', len(combined_all_params_f2) ) +#--------------------------------------- # Add pdb_file name at the end -combined_all_params_f['pdb_file'] = gene_pdb_f +#--------------------------------------- +combined_all_params_f2['pdb_file'] = gene_pdb_f #%%============================================================================ #--------------------- @@ -1213,9 +1374,9 @@ print('\nWriting file: all params') print('\nOutput 3 filename:', outfile_comb , '\n===================================================================\n') -combined_all_params_f.to_csv(outfile_comb, index = False) +combined_all_params_f2.to_csv(outfile_comb, index = False) print('\nFinished writing file:' - , '\nNo. of rows:', combined_all_params_f.shape[0] - , '\nNo. of cols:', combined_all_params_f.shape[1]) + , '\nNo. of rows:', combined_all_params_f2.shape[0] + , '\nNo. of cols:', combined_all_params_f2.shape[1]) #%% end of script diff --git a/scripts/count_vars_ML.R b/scripts/count_vars_ML.R index 4cdf80c..8d2f4b8 100644 --- a/scripts/count_vars_ML.R +++ b/scripts/count_vars_ML.R @@ -1,13 +1,17 @@ # count numbers for ML #source("~/git/LSHTM_analysis/config/alr.R") -source("~/git/LSHTM_analysis/config/embb.R") - +#source("~/git/LSHTM_analysis/config/embb.R") #source("~/git/LSHTM_analysis/config/gid.R") -#source("~/git/LSHTM_analysis/config/pnca.R") +#source("~/git/LSHTM_analysis/config/katg.R") +source("~/git/LSHTM_analysis/config/pnca.R") +#source("~/git/LSHTM_analysis/config/rpob.R") source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") +gene +gene_match + nrow(merged_df3) ############################################## #============= @@ -15,7 +19,7 @@ nrow(merged_df3) #============== table(merged_df3$mutation_info) sum(table(merged_df3$mutation_info)) -sum(table(merged_df3$mutation_info)) +table(merged_df3$mutation_info_orig) ############################################## #============= @@ -64,3 +68,47 @@ sum(table(merged_df3$drtype_mode_labels)) table(merged_df3$lineage) sum(table(merged_df3$lineage_labels)) +# write file +outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv') +outfile_merged_df3 +write.csv(merged_df3, outfile_merged_df3) + +outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv') +outfile_merged_df2 +write.csv(merged_df2, outfile_merged_df2) + +################################################### +################################################### +################################################### + +source("~/git/LSHTM_analysis/config/alr.R") +source("~/git/LSHTM_analysis/config/embb.R") +source("~/git/LSHTM_analysis/config/gid.R") +source("~/git/LSHTM_analysis/config/katg.R") +source("~/git/LSHTM_analysis/config/pnca.R") +source("~/git/LSHTM_analysis/config/rpob.R") + +df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv") +df3 = read.csv(df3_filename) + +# mutationinformation +length(unique((df3$mutationinformation))) + +#dm _om +table(df3$mutation_info) +table(df3$mutation_info_labels) +table(df3$mutation_info_orig) +table(df3$mutation_info_labels_orig) + +# test_set +na_count <-sapply(df3, function(y) sum(length(which(is.na(y))))) +na_count[drug] + +# training set +table(df3[drug]) + +# drtype: MDR and XDR +#table(df3$drtype) orig i.e. incorrect ones! +table(df3$drtype_mode_labels) + +