From aaa24ca32d17767b111ec06c4e3f180b88792500 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 4 Jun 2021 15:05:52 +0100 Subject: [PATCH] minor updates to dir.R --- scripts/combining_dfs.py | 383 --------------------------- scripts/plotting/basic_barplots_PS.R | 29 +- scripts/plotting/dirs.R | 8 +- scripts/plotting/plotting_data.R | 9 +- 4 files changed, 11 insertions(+), 418 deletions(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index 1e66328..b15f59c 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -238,9 +238,6 @@ combined_df[merging_cols_m4].apply(len) == len(combined_df) #deepddg_df = pd.read_csv(infile_deepddg, sep = ' ') - - - #%%============================================================================ # Output columns @@ -257,384 +254,4 @@ print('\nFinished writing file:' , '\nNo. of cols:', combined_df.shape[1]) - - - - - -#%%============================================================================ -# OR merges: TEDIOUSSSS!!!! -#[ DELETE ] -del(mcsm_df, foldx_df, mcsm_foldx_dfs, dssp_kd_dfs, dssp_kd_rd_dfs,rd_df, kd_df, infile_mcsm, infile_foldx, infile_dssp, infile_kd) -del(merging_cols_m1, merging_cols_m2, merging_cols_m3, merging_cols_m4) -del(in_filename_dssp, in_filename_foldx, in_filename_kd, in_filename_mcsm, in_filename_rd) -#%% -print('===================================' - , '\nFifth merge: afor_df + afor_kin_df' - , '\n===================================') - -# OR combining -afor_df = pd.read_csv(infile_afor, sep = ',') -#afor_df.columns = afor_df.columns.str.lower() - -afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',') -afor_kin_df.columns = afor_kin_df.columns.str.lower() - -merging_cols_m5 = detect_common_cols(afor_df, afor_kin_df) - -print('Dim of afor_df:', afor_df.shape - , '\nDim of afor_kin_df:', afor_kin_df.shape) - -# finding if ALL afor_kin_df muts are present in afor_df -# i.e all kinship muts should be PRESENT in mycalcs_present -if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]: - print('PASS: ALL', len(afor_kin_df), 'or_kinship muts are present in my or list') -else: - nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]) - nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])] - print('FAIL:', nf_muts, 'muts present in afor_kin_df NOT present in my or list' - , '\nsee "nf_muts_df" created containing not found(nf) muts') - sys.exit() - - -# Now checking how many afor_df muts are NOT present in afor_kin_df -common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])]) -extra_muts_myor = afor_kin_df.shape[0] - common_muts - -print('==========================================' - , '\nmy or calcs has', common_muts, 'present in af_or_kin_df' - , '\n==========================================') - -print('Expected cals for merging with outer_join...') - -expected_rows = afor_df.shape[0] + extra_muts_myor -expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5) - - -afor_df['mutation'] -afor_kin_df['mutation'] - -ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join) - -if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols: - print('PASS but with duplicate muts: OR dfs successfully combined! PHEWWWW!' - , '\nDuplicate muts present but with different \'ref\' and \'alt\' alleles') -else: - print('FAIL: could not combine OR dfs' - , '\nCheck expected rows and cols calculation and join type') - -print('Dim of merged ors_df:', ors_df.shape) - -ors_df[merging_cols_m5].apply(len) -ors_df[merging_cols_m5].apply(len) == len(ors_df) - -#%%============================================================================ -# formatting ors_df -ors_df.columns - -# Dropping unncessary columns: already removed in ealier preprocessing -cols_to_drop = ['n_miss'] -print('Dropping', len(cols_to_drop), 'columns:\n' - , cols_to_drop) -ors_df.drop(cols_to_drop, axis = 1, inplace = True) - -print('Reordering', ors_df.shape[1], 'columns' - , '\n===============================================') -cols = ors_df.columns - -column_order = ['mutation' - , 'mutationinformation' - , 'wild_type' - , 'position' - , 'mutant_type' - , 'ref_allele' - , 'alt_allele' - , 'mut_info_f1' - , 'mut_info_f2' - , 'mut_type' - , 'gene_id' - , 'gene_name' - , 'chromosome_number' - , 'af' - , 'af_kin' - , 'est_chisq' - , 'or_mychisq' - , 'or_fisher' - , 'or_logistic' - , 'or_kin' - , 'pval_chisq' - , 'pval_fisher' - , 'pval_logistic' - , 'pwald_kin' - , 'ci_low_fisher' - , 'ci_hi_fisher' - , 'ci_low_logistic' - , 'ci_hi_logistic' - , 'beta_logistic' - , 'beta_kin' - , 'se_logistic' - , 'se_kin' - , 'zval_logistic' - , 'logl_h1_kin' - , 'l_remle_kin'] - -if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())): - print('PASS: Column order generated for all:', len(column_order), 'columns' - , '\nColumn names match, safe to reorder columns' - , '\nApplying column order to df...' ) - ors_df_ordered = ors_df[column_order] -else: - print('FAIL: Mismatch in no. of cols to reorder' - , '\nNo. of cols in df to reorder:', ors_df.shape[1] - , '\nNo. of cols order generated for:', len(column_order)) - sys.exit() - -print('\nResult of Sixth merge:', ors_df_ordered.shape - , '\n===================================================================') -#%% -ors_df_ordered.shape -check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']] - -# populating 'nan' info -lookup_dict = dict() -for k, v in low_3letter_dict.items(): - lookup_dict[k] = v['one_letter_code'] - #print(lookup_dict) - - wt = ors_df_ordered['mutation'].str.extract(wt_regex).squeeze() - #print(wt) - ors_df_ordered['wild_type'] = wt.map(lookup_dict) - - ors_df_ordered['position'] = ors_df_ordered['mutation'].str.extract(pos_regex) - - mt = ors_df_ordered['mutation'].str.extract(mut_regex).squeeze() - ors_df_ordered['mutant_type'] = mt.map(lookup_dict) - -ors_df_ordered['mutationinformation'] = ors_df_ordered['wild_type'] + ors_df_ordered.position.map(str) + ors_df_ordered['mutant_type'] -check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']] - -# populate mut_info_f1 -ors_df_ordered['mut_info_f1'].isna().sum() -ors_df_ordered['mut_info_f1'] = ors_df_ordered['position'].astype(str) + ors_df_ordered['wild_type'] + '>' + ors_df_ordered['position'].astype(str) + ors_df_ordered['mutant_type'] -ors_df_ordered['mut_info_f1'].isna().sum() - -# populate mut_info_f2 -ors_df_ordered['mut_info_f2'] = ors_df_ordered['mutation'].str.replace(gene_match.lower(), 'p.', regex = True) - -# populate mut_type -ors_df_ordered['mut_type'].isna().sum() -#mut_type_word = ors_df_ordered['mut_type'].value_counts() -mut_type_word = 'missense' # FIXME, should be derived -ors_df_ordered['mut_type'].fillna(mut_type_word, inplace = True) -ors_df_ordered['mut_type'].isna().sum() - -# populate gene_id -ors_df_ordered['gene_id'].isna().sum() -#gene_id_word = ors_df_ordered['gene_id'].value_counts() -gene_id_word = 'Rv2043c' # FIXME, should be derived -ors_df_ordered['gene_id'].fillna(gene_id_word, inplace = True) -ors_df_ordered['gene_id'].isna().sum() - -# populate gene_name -ors_df_ordered['gene_name'].isna().sum() -ors_df_ordered['gene_name'].value_counts() -ors_df_ordered['gene_name'].fillna(gene, inplace = True) -ors_df_ordered['gene_name'].isna().sum() - -# check numbers -ors_df_ordered['or_kin'].isna().sum() -# should be 0 -ors_df_ordered['or_mychisq'].isna().sum() - -#%%============================================================================ -print('===================================' - , '\nSixth merge: Fourth + Fifth merge' - , '\ncombined_df + ors_df_ordered' - , '\n===================================') - -#combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join) -merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered) - -# dtype problems -if len(merging_cols_m6) > 1 and 'position'in merging_cols_m6: - print('Removing \'position\' from merging_cols_m6 to make dtypes consistent' - , '\norig length of merging_cols_m6:', len(merging_cols_m6)) - merging_cols_m6.remove('position') - print('\nlength after removing:', len(merging_cols_m6)) - -print('Dim of df1:', combined_df.shape - , '\nDim of df2:', ors_df_ordered.shape - , '\nNo. of merging_cols:', len(merging_cols_m6)) - -print('Checking mutations in the two dfs:' - , '\nmuts in df1 present in df2:' - , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum() - , '\nmuts in df2 present in df1:' - , ors_df_ordered['mutationinformation'].isin(combined_df['mutationinformation']).sum()) - -#---------- -# merge 6 -#---------- -combined_df_all = pd.merge(combined_df, ors_df_ordered, on = merging_cols_m6, how = o_join) -combined_df_all.shape - -# sanity check for merge 6 -outdf_expected_rows = len(combined_df) + extra_muts_myor -unique_muts = len(combined_df) -outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6) - -if combined_df_all.shape[0] == outdf_expected_rows and combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == unique_muts: - print('PASS: Df dimension match' - , '\ncombined_df_all with join type:', o_join - , '\n', combined_df_all.shape - , '\n===============================================================') -else: - print('FAIL: Df dimension mismatch' - , 'Cannot generate expected dim. See details of merge performed' - , '\ndf1 dim:', combined_df.shape - , '\ndf2 dim:', ors_df_ordered.shape - , '\nGot:', combined_df_all.shape - , '\nmuts in df1 but NOT in df2:' - , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum() - , '\nmuts in df2 but NOT in df1:' - , ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum()) - sys.exit() - -# drop extra cols -all_cols = combined_df_all.columns - -#pos_cols_check = combined_df_all[['position_x','position_y']] -c = combined_df_all[['position_x','position_y']].isna().sum() -pos_col_to_drop = c.index[c>0].to_list() -cols_to_drop = pos_col_to_drop + ['wild_type_kd'] - -print('Dropping', len(cols_to_drop), 'columns:\n', cols_to_drop) -combined_df_all.drop(cols_to_drop, axis = 1, inplace = True) - -# rename position_x to position -pos_col_to_rename = c.index[c==0].to_list() -combined_df_all.shape -combined_df_all.rename(columns = { pos_col_to_rename[0]: 'position'}, inplace = True) -combined_df_all.shape - -all_cols = combined_df_all.columns - -#%% reorder cols to for convenience -first_cols = ['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type'] -last_cols = [col for col in combined_df_all.columns if col not in first_cols] - -combined_df_all = combined_df_all[first_cols+last_cols] - -#%% IMPORTANT: check if mutation related info is all populated after this merge -# select string colnames to ensure no NA exist there -string_cols = combined_df_all.columns[combined_df_all.applymap(lambda x: isinstance(x, str)).all(0)] - -if (combined_df_all[string_cols].isna().sum(axis = 0)== 0).all(): - print('PASS: All string cols are populated with no NAs') -else: - print('FAIL: NAs detected in string cols') - print(combined_df_all[string_cols].isna().sum(axis = 0)) - sys.exit() - -# relevant mut cols -check_mut_cols = merging_cols_m5 + merging_cols_m6 - -count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'}) -print(check_mut_cols) - -c2 = combined_df_all[check_mut_cols].isna().sum() -missing_info_cols = c2.index[c2>0].to_list() - -if c2.sum()>0: - #na_muts_n = combined_df_all['mutation'].isna().sum() - na_muts_n = combined_df_all[missing_info_cols].isna().sum() - print(na_muts_n.values[0], 'mutations have missing \'mutation\' info.' - , '\nFetching these from reference dict...') -else: - print('No missing \'mutation\' has been detected!') - -lookup_dict = dict() -for k, v in oneletter_aa_dict.items(): - lookup_dict[k] = v['three_letter_code_lower'] - print(lookup_dict) - wt_3let = combined_df_all['wild_type'].map(lookup_dict) - #print(wt_3let) - pos = combined_df_all['position'].astype(str) - #print(pos) - mt_3let = combined_df_all['mutant_type'].map(lookup_dict) - #print(mt_3let) - # override the 'mutation' column - combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let - print(combined_df_all['mutation']) - -# check again -if combined_df_all[missing_info_cols].isna().sum().all() == 0: - print('PASS: No mutations have missing \'mutation\' info.') -else: - print('FAIL:', combined_df_all[missing_info_cols].isna().sum().values[0] - , '\nmutations have missing info STILL...') - sys.exit() - -#%% check -foo = combined_df_all.drop_duplicates('mutationinformation') -foo2 = combined_df_all.drop_duplicates('mutation') -if foo.equals(foo2): - print('PASS: Dropping mutation or mutatationinformation has the same effect\n') -else: - print('FAIL: Still problems in merged data') - sys.exit() - -#%%============================================================================ -output_cols = combined_df_all.columns - -#%% IMPORTANT result info -if combined_df_all['or_mychisq'].isna().sum() == len(combined_df) - len(afor_df): - print('PASS: No. of NA in or_mychisq matches expected length' - , '\nNo. of with NA in or_mychisq:', combined_df_all['or_mychisq'].isna().sum() - , '\nNo. of NA in or_kin:', combined_df_all['or_kin'].isna().sum()) -else: - print('FAIL: No. of NA in or_mychisq does not match expected length') - - -if combined_df_all.shape[0] == outdf_expected_rows: - print('\nINFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele' - , '\n=============================================================') -else: - print('combined_df_all has no duplicate muts present' - ,'\n===============================================================') - -print('\nDim of combined_data:', combined_df_all.shape - , '\nNo. of unique mutations:', combined_df_all['mutationinformation'].nunique()) - - -#%%============================================================================ -# write csv -print('Writing file: combined output of all params needed for plotting and ML') -combined_df_all.to_csv(outfile_comb, index = False) -print('\nFinished writing file:' - , '\nNo. of rows:', combined_df_all.shape[0] - , '\nNo. of cols:', combined_df_all.shape[1]) -#======================================================================= -#%% incase you FIX the the function: combine_dfs_with_checks -#def main(): - -# print('Reading input files:') - #mcsm_df = pd.read_csv(infile_mcsm, sep = ',') - #mcsm_df.columns = mcsm_df.columns.str.lower() - - #foldx_df = pd.read_csv(infile_foldx , sep = ',') - - #dssp_df = pd.read_csv(infile_dssp, sep = ',') - #dssp_df.columns = dssp_df.columns.str.lower() - - #kd_df = pd.read_csv(infile_kd, sep = ',') - #kd_df.columns = kd_df.columns.str.lower() - - #rd_df = pd.read_csv(infile_kd, sep = ',') - - - -#if __name__ == '__main__': -# main() -#======================================================================= #%% end of script diff --git a/scripts/plotting/basic_barplots_PS.R b/scripts/plotting/basic_barplots_PS.R index 1d6e681..aea1c21 100755 --- a/scripts/plotting/basic_barplots_PS.R +++ b/scripts/plotting/basic_barplots_PS.R @@ -156,34 +156,7 @@ foo = select(df, mutationinformation svg(plot_pos_count_duet) print(paste0("plot filename:", plot_pos_count_duet)) -my_ats = 25 # axis text size -my_als = 22 # axis label size - -# to make x axis display all positions -# not sure if to use with sort or directly -my_x = sort(unique(snpsBYpos_df$snpsBYpos)) - -g = ggplot(snpsBYpos_df, aes(x = snpsBYpos)) -OutPlot_pos_count = g + geom_bar(aes (alpha = 0.5) - , show.legend = FALSE) + - scale_x_continuous(breaks = unique(snpsBYpos_df$snpsBYpos)) + - #scale_x_continuous(breaks = my_x) + - geom_label(stat = "count", aes(label = ..count..) - , color = "black" - , size = 10) + - theme(axis.text.x = element_text(size = my_ats - , angle = 0) - , axis.text.y = element_text(size = my_ats - , angle = 0 - , hjust = 1) - , axis.title.x = element_text(size = my_als) - , axis.title.y = element_text(size = my_als) - , plot.title = element_blank()) + - - labs(x = "Number of nsSNPs" - , y = "Number of Sites") - -print(OutPlot_pos_count) +source("dirs.R") dev.off() ######################################################################## # end of PS barplots diff --git a/scripts/plotting/dirs.R b/scripts/plotting/dirs.R index aa1b9be..49783af 100644 --- a/scripts/plotting/dirs.R +++ b/scripts/plotting/dirs.R @@ -12,10 +12,10 @@ import_dirs <- function(drug, gene) { #============= # directories and variables #============= - datadir <<- paste0("~/git/Data") - indir <<- paste0(datadir, "/", drug, "/input") - outdir <<- paste0("~/git/Data", "/", drug, "/output") - plotdir <<- paste0("~/git/Data", "/", drug, "/output/plots") + datadir <<- paste0("~/git/Data/") + indir <<- paste0(datadir, drug, "/input") + outdir <<- paste0("~/git/Data/", drug, "/output") + plotdir <<- paste0("~/git/Data/", drug, "/output/plots") dr_muts_col <<- paste0('dr_mutations_', drug) other_muts_col <<- paste0('other_mutations_', drug) diff --git a/scripts/plotting/plotting_data.R b/scripts/plotting/plotting_data.R index e4ee4ff..d7a0ec7 100755 --- a/scripts/plotting/plotting_data.R +++ b/scripts/plotting/plotting_data.R @@ -15,8 +15,6 @@ library(ggplot2) library(data.table) library(dplyr) require("getopt", quietly = TRUE) #cmd parse arguments -source("dirs.R") - #======================================================== # command line args spec = matrix(c( @@ -37,8 +35,13 @@ gene = "gid" if(is.null(drug)|is.null(gene)) { stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") } - #======================================================== +# Load functions +# import dir structure +source("dirs.R") +import_dirs(drug, gene) +#======================================================= + #====== # input #======