diff --git a/.gitignore b/.gitignore index ac8fba0..caf6279 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ scratch test plotting_test scripts_old +foldx/test/ diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index ba40b46..8b8e556 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -54,6 +54,8 @@ os.getcwd() # FIXME: local imports #from combining import combine_dfs_with_checks from combining_FIXME import detect_common_cols +from reference_dict import oneletter_aa_dict # CHECK DIR STRUC THERE! +from reference_dict import low_3letter_dict # CHECK DIR STRUC THERE! #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() @@ -71,13 +73,27 @@ args = arg_parser.parse_args() #%% variable assignment: input and output #drug = 'pyrazinamide' #gene = 'pncA' -gene_match = gene + '_p.' drug = args.drug gene = args.gene datadir = args.datadir indir = args.input_dir outdir = args.output_dir + +gene_match = gene + '_p.' +print('mut pattern for gene', gene, ':', gene_match) + +nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}' +print('nsSNP for gene', gene, ':', nssnp_match) + +wt_regex = gene_match.lower()+'([A-Za-z]{3})' +print('wt regex:', wt_regex) + +mut_regex = r'[0-9]+(\w{3})$' +print('mt regex:', mut_regex) + +pos_regex = r'([0-9]+)' +print('position regex:', pos_regex) #%%======================================================================= #============== # directories @@ -155,6 +171,8 @@ ncols_m1 = len(mcsm_foldx_dfs.columns) print('\n\nResult of first merge:', mcsm_foldx_dfs.shape , '\n===================================================================') +mcsm_foldx_dfs[merging_cols_m1].apply(len) +mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs) #%%============================================================================ print('===================================' , '\nSecond merge: dssp + kd' @@ -183,6 +201,8 @@ ncols_m3 = len(dssp_kd_rd_dfs.columns) print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape , '\n===================================================================') +dssp_kd_rd_dfs[merging_cols_m3].apply(len) +dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs) #%%============================================================================ print('=======================================' , '\nFourth merge: First merge + Third merge' @@ -203,12 +223,16 @@ else: print('\nResult of Fourth merge:', combined_df.shape , '\n===================================================================') +combined_df[merging_cols_m4].apply(len) +combined_df[merging_cols_m4].apply(len) == len(combined_df) #%%============================================================================ # OR merges: TEDIOUSSSS!!!! 
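# ----------------------------------------------------------------------------
# Editorial sketch (not part of the patch): what the three regexes defined
# above (wt_regex, pos_regex, mut_regex) are expected to pull out, assuming the
# OR mutation strings look like 'pnca_p.ala134gly' (lowercase three-letter
# amino-acid codes). The tiny lookup dict is only a stand-in for the one-letter
# codes held in reference_dict; names and values here are illustrative.
# ----------------------------------------------------------------------------
import re

gene_match = 'pncA_p.'
wt_regex  = gene_match.lower() + '([A-Za-z]{3})'   # wild-type residue, e.g. 'ala'
pos_regex = r'([0-9]+)'                            # residue position, e.g. '134'
mut_regex = r'[0-9]+(\w{3})$'                      # mutant residue at string end, e.g. 'gly'

mut = 'pnca_p.ala134gly'
wt  = re.search(wt_regex, mut).group(1)    # 'ala'
pos = re.search(pos_regex, mut).group(1)   # '134'
mt  = re.search(mut_regex, mut).group(1)   # 'gly'

to_one_letter = {'ala': 'A', 'gly': 'G'}   # stand-in for low_3letter_dict/one_letter_code
mutationinformation = to_one_letter[wt] + pos + to_one_letter[mt]   # 'A134G'
print(mutationinformation)
# ----------------------------------------------------------------------------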
- -#%%RRRR +del(mcsm_df, foldx_df, mcsm_foldx_dfs, dssp_kd_dfs, dssp_kd_rd_dfs,rd_df, kd_df, infile_mcsm, infile_foldx, infile_dssp, infile_kd) +del(merging_cols_m1, merging_cols_m2, merging_cols_m3, merging_cols_m4) +del(in_filename_dssp, in_filename_foldx, in_filename_kd, in_filename_mcsm, in_filename_rd) +#%% print('===================================' , '\nFifth merge: afor_df + afor_kin_df' , '\n===================================') @@ -220,8 +244,6 @@ afor_df = pd.read_csv(infile_afor, sep = ',') afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',') afor_kin_df.columns = afor_kin_df.columns.str.lower() - - merging_cols_m5 = detect_common_cols(afor_df, afor_kin_df) print('Dim of afor_df:', afor_df.shape @@ -230,7 +252,7 @@ print('Dim of afor_df:', afor_df.shape # finding if ALL afor_kin_df muts are present in afor_df # i.e all kinship muts should be PRESENT in mycalcs_present if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]: - print('PASS: ALL or_kinship muts are present in my or list') + print('PASS: ALL', len(afor_kin_df), 'or_kinship muts are present in my or list') else: nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]) nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])] @@ -241,10 +263,10 @@ else: # Now checking how many afor_df muts are NOT present in afor_kin_df common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])]) -extra_muts_myor = afor_kin_df.shape[0] - common_muts +extra_muts_myor = afor_kin_df.shape[0] - common_muts print('==========================================' - , '\nmy or calcs', extra_muts_myor, 'extra mutation\n' + , '\nmy or calcs has', common_muts, 'present in af_or_kin_df' , '\n==========================================') print('Expected cals for merging with outer_join...') @@ -252,23 +274,29 @@ print('Expected cals for merging with outer_join...') expected_rows = afor_df.shape[0] + extra_muts_myor expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5) + +afor_df['mutation'] +afor_kin_df['mutation'] + ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join) if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols: - print('PASS: OR dfs successfully combined! PHEWWWW!') + print('PASS but with duplicate muts: OR dfs successfully combined! PHEWWWW!' 
+ , '\nDuplicate muts present but with different \'ref\' and \'alt\' alleles') else: print('FAIL: could not combine OR dfs' , '\nCheck expected rows and cols calculation and join type') print('Dim of merged ors_df:', ors_df.shape) + +ors_df[merging_cols_m5].apply(len) +ors_df[merging_cols_m5].apply(len) == len(ors_df) + #%%============================================================================ # formatting ors_df - ors_df.columns - # Dropping unncessary columns: already removed in ealier preprocessing -#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ] cols_to_drop = ['n_miss'] print('Dropping', len(cols_to_drop), 'columns:\n' , cols_to_drop) @@ -283,18 +311,13 @@ column_order = ['mutation' , 'wild_type' , 'position' , 'mutant_type' - #, 'chr_num_allele' #old , 'ref_allele' , 'alt_allele' , 'mut_info_f1' , 'mut_info_f2' , 'mut_type' , 'gene_id' - #, 'gene_number' #old , 'gene_name' - #, 'mut_region' - #, 'reference_allele' - #, 'alternate_allele' , 'chromosome_number' , 'af' , 'af_kin' @@ -317,14 +340,9 @@ column_order = ['mutation' , 'se_kin' , 'zval_logistic' , 'logl_h1_kin' - , 'l_remle_kin' - #, 'wt_3let' # old - #, 'mt_3let' # old - #, 'symbol' - #, 'n_miss' - ] + , 'l_remle_kin'] -if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all()): +if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())): print('PASS: Column order generated for all:', len(column_order), 'columns' , '\nColumn names match, safe to reorder columns' , '\nApplying column order to df...' ) @@ -337,6 +355,61 @@ else: print('\nResult of Sixth merge:', ors_df_ordered.shape , '\n===================================================================') +#%% +ors_df_ordered.shape +check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']] + +# populating 'nan' info +lookup_dict = dict() +for k, v in low_3letter_dict.items(): + lookup_dict[k] = v['one_letter_code'] + #print(lookup_dict) + + wt = ors_df_ordered['mutation'].str.extract(wt_regex).squeeze() + #print(wt) + ors_df_ordered['wild_type'] = wt.map(lookup_dict) + + ors_df_ordered['position'] = ors_df_ordered['mutation'].str.extract(pos_regex) + + mt = ors_df_ordered['mutation'].str.extract(mut_regex).squeeze() + ors_df_ordered['mutant_type'] = mt.map(lookup_dict) + +ors_df_ordered['mutationinformation'] = ors_df_ordered['wild_type'] + ors_df_ordered.position.map(str) + ors_df_ordered['mutant_type'] +check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']] + +# populate mut_info_f1 +ors_df_ordered['mut_info_f1'].isna().sum() +ors_df_ordered['mut_info_f1'] = ors_df_ordered['position'].astype(str) + ors_df_ordered['wild_type'] + '>' + ors_df_ordered['position'].astype(str) + ors_df_ordered['mutant_type'] +ors_df_ordered['mut_info_f1'].isna().sum() + +# populate mut_info_f2 +ors_df_ordered['mut_info_f2'] = ors_df_ordered['mutation'].str.replace(gene_match.lower(), 'p.', regex = True) + +# populate mut_type +ors_df_ordered['mut_type'].isna().sum() +#mut_type_word = ors_df_ordered['mut_type'].value_counts() +mut_type_word = 'missense' # FIXME, should be derived +ors_df_ordered['mut_type'].fillna(mut_type_word, inplace = True) +ors_df_ordered['mut_type'].isna().sum() + +# populate gene_id +ors_df_ordered['gene_id'].isna().sum() +#gene_id_word = ors_df_ordered['gene_id'].value_counts() +gene_id_word = 'Rv2043c' # FIXME, should be derived 
+ors_df_ordered['gene_id'].fillna(gene_id_word, inplace = True) +ors_df_ordered['gene_id'].isna().sum() + +# populate gene_name +ors_df_ordered['gene_name'].isna().sum() +ors_df_ordered['gene_name'].value_counts() +ors_df_ordered['gene_name'].fillna(gene, inplace = True) +ors_df_ordered['gene_name'].isna().sum() + +# check numbers +ors_df_ordered['or_kin'].isna().sum() +# should be 0 +ors_df_ordered['or_mychisq'].isna().sum() + #%%============================================================================ print('===================================' , '\nSixth merge: Fourth + Fifth merge' @@ -344,53 +417,159 @@ print('===================================' , '\n===================================') #combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join) -merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered) +merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered) + +# dtype problems +if len(merging_cols_m6) > 1 and 'position'in merging_cols_m6: + print('Removing \'position\' from merging_cols_m6 to make dtypes consistent' + , '\norig length of merging_cols_m6:', len(merging_cols_m6)) + merging_cols_m6.remove('position') + print('\nlength after removing:', len(merging_cols_m6)) + print('Dim of df1:', combined_df.shape , '\nDim of df2:', ors_df_ordered.shape , '\nNo. of merging_cols:', len(merging_cols_m6)) print('Checking mutations in the two dfs:' - , '\nmuts in df1 but NOT in df2:' + , '\nmuts in df1 present in df2:' , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum() - , '\nmuts in df2 but NOT in df1:' + , '\nmuts in df2 present in df1:' , ors_df_ordered['mutationinformation'].isin(combined_df['mutationinformation']).sum()) -#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) ) - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join) +#---------- +# merge 6 +#---------- +combined_df_all = pd.merge(combined_df, ors_df_ordered, on = merging_cols_m6, how = o_join) combined_df_all.shape -# FIXME: DIM -# only with left join! -outdf_expected_rows = len(combined_df) +# sanity check for merge 6 +outdf_expected_rows = len(combined_df) + extra_muts_myor +unique_muts = len(combined_df) outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6) -#if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all.shape[0] == outdf_expected_rows: -if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == outdf_expected_rows: +if combined_df_all.shape[0] == outdf_expected_rows and combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == unique_muts: print('PASS: Df dimension match' - , '\nDim of combined_df_all with join type:', l_join + , '\ncombined_df_all with join type:', o_join , '\n', combined_df_all.shape , '\n===============================================================') else: print('FAIL: Df dimension mismatch' , 'Cannot generate expected dim. 
See details of merge performed' , '\ndf1 dim:', combined_df.shape - , '\ndf2 dim:', ors_df.shape + , '\ndf2 dim:', ors_df_ordered.shape , '\nGot:', combined_df_all.shape , '\nmuts in df1 but NOT in df2:' - , combined_df['mutationinformation'].isin(ors_df['mutationinformation']).sum() + , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum() , '\nmuts in df2 but NOT in df1:' , ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum()) sys.exit() -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# nan in mutation col -# FIXME: should get fixmed with JP's resolved dataset!? -combined_df_all['mutation'].isna().sum() -baz = combined_df_all[combined_df_all['mutation'].isna()] + +# drop extra cols +all_cols = combined_df_all.columns + +#pos_cols_check = combined_df_all[['position_x','position_y']] +c = combined_df_all[['position_x','position_y']].isna().sum() +pos_col_to_drop = c.index[c>0].to_list() +cols_to_drop = pos_col_to_drop + ['wild_type_kd'] + +print('Dropping', len(cols_to_drop), 'columns:\n', cols_to_drop) +combined_df_all.drop(cols_to_drop, axis = 1, inplace = True) + +# rename position_x to position +pos_col_to_rename = c.index[c==0].to_list() +combined_df_all.shape +combined_df_all.rename(columns = { pos_col_to_rename[0]: 'position'}, inplace = True) +combined_df_all.shape + +all_cols = combined_df_all.columns + +#%% reorder cols to for convenience +first_cols = ['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type'] +last_cols = [col for col in combined_df_all.columns if col not in first_cols] + +combined_df_all = combined_df_all[first_cols+last_cols] + +#%% IMPORTANT: check if mutation related info is all populated after this merge +# select string colnames to ensure no NA exist there +string_cols = combined_df_all.columns[combined_df_all.applymap(lambda x: isinstance(x, str)).all(0)] + +if (combined_df_all[string_cols].isna().sum(axis = 0)== 0).all(): + print('PASS: All string cols are populated with no NAs') +else: + print('FAIL: NAs detected in string cols') + print(combined_df_all[string_cols].isna().sum(axis = 0)) + sys.exit() + +# relevant mut cols +check_mut_cols = merging_cols_m5 + merging_cols_m6 + +count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'}) +print(check_mut_cols) + +c2 = combined_df_all[check_mut_cols].isna().sum() +missing_info_cols = c2.index[c2>0].to_list() + +if c2.sum()>0: + #na_muts_n = combined_df_all['mutation'].isna().sum() + na_muts_n = combined_df_all[missing_info_cols].isna().sum() + print(na_muts_n.values[0], 'mutations have missing \'mutation\' info.' + , '\nFetching these from reference dict...') +else: + print('No missing \'mutation\' has been detected!') + +lookup_dict = dict() +for k, v in oneletter_aa_dict.items(): + lookup_dict[k] = v['three_letter_code_lower'] + print(lookup_dict) + wt_3let = combined_df_all['wild_type'].map(lookup_dict) + #print(wt_3let) + pos = combined_df_all['position'].astype(str) + #print(pos) + mt_3let = combined_df_all['mutant_type'].map(lookup_dict) + #print(mt_3let) + # override the 'mutation' column + combined_df_all['mutation'] = 'pnca_p.' 
+ wt_3let + pos + mt_3let + print(combined_df_all['mutation']) + +# check again +if combined_df_all[missing_info_cols].isna().sum().all() == 0: + print('PASS: No mutations have missing \'mutation\' info.') +else: + print('FAIL:', combined_df_all[missing_info_cols].isna().sum().values[0] + , '\nmutations have missing info STILL...') + sys.exit() + +#%% check +foo = combined_df_all.drop_duplicates('mutationinformation') +foo2 = combined_df_all.drop_duplicates('mutation') +if foo.equals(foo2): + print('PASS: Dropping mutation or mutatationinformation has the same effect\n') +else: + print('FAIL: Still problems in merged data') + sys.exit() + #%%============================================================================ output_cols = combined_df_all.columns -print('Output cols:', output_cols) + +#%% IMPORTANT result info +if combined_df_all['or_mychisq'].isna().sum() == len(combined_df) - len(afor_df): + print('PASS: No. of NA in or_mychisq matches expected length' + , '\nNo. of with NA in or_mychisq:', combined_df_all['or_mychisq'].isna().sum() + , '\nNo. of NA in or_kin:', combined_df_all['or_kin'].isna().sum()) +else: + print('FAIL: No. of NA in or_mychisq does not match expected length') + + +if combined_df_all.shape[0] == outdf_expected_rows: + print('\nINFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele' + , '\n=============================================================') +else: + print('combined_df_all has no duplicate muts present' + ,'\n===============================================================') + +print('\nDim of combined_data:', combined_df_all.shape + , '\nNo. of unique mutations:', combined_df_all['mutationinformation'].nunique()) + #%%============================================================================ # write csv diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index 2f3531a..66f57e1 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -45,8 +45,6 @@ Created on Tue Aug 6 12:56:03 2019 #5. chain #6. wild_pos #7. wild_chain_pos - - #======================================================================= #%% load libraries import os, sys @@ -83,16 +81,16 @@ gene = args.gene gene_match = gene + '_p.' 
print('mut pattern for gene', gene, ':', gene_match) -nssnp_match = gene_match +'[A-Z]{3}[0-9]+[A-Z]{3}' +nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}' print('nsSNP for gene', gene, ':', nssnp_match) -wt_regex = gene_match.lower()+'(\w{3})' +wt_regex = gene_match.lower()+'([A-Za-z]{3})' print('wt regex:', wt_regex) -mut_regex = r'\d+(\w{3})$' +mut_regex = r'[0-9]+(\w{3})$' print('mt regex:', mut_regex) -pos_regex = r'(\d+)' +pos_regex = r'([0-9]+)' print('position regex:', pos_regex) # building cols to extract @@ -156,30 +154,29 @@ if in_filename_master == 'original_tanushree_data_v2.csv': else: core_cols = ['id' , 'sample' - , 'patient_id' - , 'strain' + #, 'patient_id' + #, 'strain' , 'lineage' , 'sublineage' - , 'country' + #, 'country' , 'country_code' , 'geographic_source' #, 'region' - , 'location' - , 'host_body_site' - , 'environment_material' - , 'host_status' - , 'host_sex' - , 'submitted_host_sex' - , 'hiv_status' - , 'HIV_status' - , 'tissue_type' - , 'isolation_source' + #, 'location' + #, 'host_body_site' + #, 'environment_material' + #, 'host_status' + #, 'host_sex' + #, 'submitted_host_sex' + #, 'hiv_status' + #, 'HIV_status' + #, 'tissue_type' + #, 'isolation_source' , resistance_col] variable_based_cols = [drug , dr_muts_col , other_muts_col] - #, resistance_col] cols_to_extract = core_cols + variable_based_cols print('Extracting', len(cols_to_extract), 'columns from master data') @@ -202,7 +199,7 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum() #%% Write check file check_file = outdir + '/' + gene.lower() + '_gwas.csv' -meta_data.to_csv(check_file) +meta_data.to_csv(check_file, index = False) print('Writing subsetted gwas data' , '\nFile', check_file , '\nDim:', meta_data.shape) @@ -217,9 +214,9 @@ print('Writing subsetted gwas data' # drug counts: complete samples for OR calcs meta_data[drug].value_counts() print('===========================================================\n' - , 'RESULT: No. of Sus and Res samples:\n', meta_data[drug].value_counts() + , 'RESULT: No. 
of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts() , '\n===========================================================\n' - , 'RESULT: Percentage of Sus and Res samples:\n', meta_data[drug].value_counts(normalize = True)*100 + , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100 , '\n===========================================================') #%% @@ -1173,7 +1170,7 @@ del(out_filename_metadata_poscounts) #%% Write file: gene_metadata (i.e gene_LF1) # where each row has UNIQUE mutations NOT unique sample ids -out_filename_metadata = gene.lower() + '_metadata.csv' +out_filename_metadata = gene.lower() + '_metadata_raw.csv' outfile_metadata = outdir + '/' + out_filename_metadata print('Writing file: LF formatted data' , '\nFile:', outfile_metadata diff --git a/scripts/ks_test_PS.R b/scripts/ks_test_PS.R new file mode 100644 index 0000000..bc11957 --- /dev/null +++ b/scripts/ks_test_PS.R @@ -0,0 +1,211 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: KS test for PS/DUET lineage distributions +#======================================================================= +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/") +getwd() + +#source("/plotting/Header_TT.R") +#source("../barplot_colour_function.R") +#require(data.table) +source("plotting/combining_dfs_plotting.R") +# should return the following dfs, directories and variables + +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig + +# 9) my_df_u +# 10) my_df_u_lig + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. 
of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3)) + +########################### +# Data for stats +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +# using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available, hence use df with NA +########################### + +# REASSIGNMENT +my_df = merged_df2 + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +table(my_df$mutation_info); str(my_df$mutation_info) + +# subset df with dr muts only +my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") +table(my_df_dr$mutation_info) + +# stats for all muts and dr_muts +# 1) for all muts +# 2) for dr_muts +#=========================== +table(my_df$lineage); str(my_df$lineage) +table(my_df_dr$lineage); str(my_df_dr$lineage) + +# subset only lineages1-4 +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4") + +# subset and refactor: all muts +df_lin = subset(my_df, subset = lineage %in% sel_lineages) +df_lin$lineage = factor(df_lin$lineage) + +# subset and refactor: dr muts +df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages) +df_lin_dr$lineage = factor(df_lin_dr$lineage) + + +#{RESULT: No of samples within lineage} +table(df_lin$lineage) +table(df_lin_dr$lineage) + +#{Result: No. of unique mutations the 4 lineages contribute to} +length(unique(df_lin$mutationinformation)) +length(unique(df_lin_dr$mutationinformation)) + + +# COMPARING DISTRIBUTIONS +#================ +# ALL mutations +#================= +head(df_lin$lineage) +df_lin$lineage = as.character(df_lin$lineage) + +lin1 = df_lin[df_lin$lineage == "lineage1",]$duet_scaled +lin2 = df_lin[df_lin$lineage == "lineage2",]$duet_scaled +lin3 = df_lin[df_lin$lineage == "lineage3",]$duet_scaled +lin4 = df_lin[df_lin$lineage == "lineage4",]$duet_scaled + +# ks test +lin12 = ks.test(lin1,lin2) +lin12_df = as.data.frame(cbind(lin12$data.name, lin12$p.value)) + +lin13 = ks.test(lin1,lin3) +lin13_df = as.data.frame(cbind(lin13$data.name, lin13$p.value)) + +lin14 = ks.test(lin1,lin4) +lin14_df = as.data.frame(cbind(lin14$data.name, lin14$p.value)) + +lin23 = ks.test(lin2,lin3) +lin23_df = as.data.frame(cbind(lin23$data.name, lin23$p.value)) + +lin24 = ks.test(lin2,lin4) +lin24_df = as.data.frame(cbind(lin24$data.name, lin24$p.value)) + +lin34 = ks.test(lin3,lin4) +lin34_df = as.data.frame(cbind(lin34$data.name, lin34$p.value)) + +ks_results_all = rbind(lin12_df + , lin13_df + , lin14_df + , lin23_df + , lin24_df + , lin34_df) + +#p-value < 2.2e-16 +rm(lin12, lin12_df + , lin13, lin13_df + , lin14, lin14_df + , lin23, lin23_df + , lin24, lin24_df + , lin34, lin34_df) + +#================ +# DRUG mutations +#================= +head(df_lin_dr$lineage) +df_lin_dr$lineage = as.character(df_lin_dr$lineage) + +lin1_dr = df_lin_dr[df_lin_dr$lineage == "lineage1",]$duet_scaled +lin2_dr = df_lin_dr[df_lin_dr$lineage == "lineage2",]$duet_scaled +lin3_dr = df_lin_dr[df_lin_dr$lineage == "lineage3",]$duet_scaled +lin4_dr = df_lin_dr[df_lin_dr$lineage == "lineage4",]$duet_scaled + +# ks test: dr muts +lin12_dr = ks.test(lin1_dr,lin2_dr) 
+lin12_df_dr = as.data.frame(cbind(lin12_dr$data.name, lin12_dr$p.value)) + +lin13_dr = ks.test(lin1_dr,lin3_dr) +lin13_df_dr = as.data.frame(cbind(lin13_dr$data.name, lin13_dr$p.value)) + +lin14_dr = ks.test(lin1_dr,lin4_dr) +lin14_df_dr = as.data.frame(cbind(lin14_dr$data.name, lin14_dr$p.value)) + +lin23_dr = ks.test(lin2_dr,lin3_dr) +lin23_df_dr = as.data.frame(cbind(lin23_dr$data.name, lin23_dr$p.value)) + +lin24_dr = ks.test(lin2_dr,lin4_dr) +lin24_df_dr = as.data.frame(cbind(lin24_dr$data.name, lin24_dr$p.value)) + +lin34_dr = ks.test(lin3_dr,lin4_dr) +lin34_df_dr = as.data.frame(cbind(lin34_dr$data.name, lin34_dr$p.value)) + +ks_results_dr = rbind(lin12_df_dr + , lin13_df_dr + , lin14_df_dr + , lin23_df_dr + , lin24_df_dr + , lin34_df_dr) + +ks_results_combined = cbind(ks_results_all, ks_results_dr) + +my_colnames = c("Lineage_comparisons" + , paste0("All_mutations n=", nrow(df_lin)) + , paste0("Drug_associated_mutations n=", nrow(df_lin_dr))) +my_colnames + +# select the output columns +ks_results_combined_f = ks_results_combined[,c(1,2,4)] + +colnames(ks_results_combined_f) = my_colnames +ks_results_combined_f + +#============= +# write output file +#============= +ks_results = paste0(outdir,"/results/ks_results.csv") +write.csv(ks_results_combined_f, ks_results, row.names = F) + diff --git a/scripts/or_kinship_link.py b/scripts/or_kinship_link.py index b1a00ce..17451fb 100755 --- a/scripts/or_kinship_link.py +++ b/scripts/or_kinship_link.py @@ -104,7 +104,7 @@ or_df.columns #%% snp_info file: master and gene specific ones # gene info -info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #447, 10 +info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #447, 11 #info_df2 = pd.read_csv(gene_info, sep = ',', header = 0) #447 10 mis_mut_cover = (info_df2['chromosome_number'].nunique()/info_df2['chromosome_number'].count()) * 100 print('*****RESULT*****' @@ -212,7 +212,7 @@ else: #PENDING Jody's reply # !!!!!!!! -# drop nan from dfm2_mis as these are not useful +# drop nan from dfm2_mis as these are not useful and JP confirmed the same print('Dropping NAs before further processing...') dfm2_mis = dfm2[dfm2['mut_type'].notnull()] # !!!!!!!! 
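# ----------------------------------------------------------------------------
# Editorial sketch (not part of the patch): the new ks_test_PS.R above writes
# out all six pairwise lineage KS comparisons by hand. A minimal Python/scipy
# version of the same pattern is shown here purely to illustrate that the
# pairwise loop can be generated rather than enumerated; 'lineage_values' and
# its toy numbers are hypothetical, not data from the analysis.
# ----------------------------------------------------------------------------
from itertools import combinations
from scipy.stats import ks_2samp
import pandas as pd

lineage_values = {                           # hypothetical duet_scaled vectors per lineage
    'lineage1': [0.10, -0.30, 0.50],
    'lineage2': [0.20, -0.10, 0.40],
    'lineage3': [-0.20, 0.00, 0.30],
    'lineage4': [0.60, -0.40, 0.10],
}

rows = []
for a, b in combinations(sorted(lineage_values), 2):
    stat, pval = ks_2samp(lineage_values[a], lineage_values[b])   # two-sample KS test
    rows.append({'comparison': f'{a} vs {b}', 'ks_stat': stat, 'p_value': pval})

ks_results = pd.DataFrame(rows)
print(ks_results)
# ----------------------------------------------------------------------------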
diff --git a/scripts/plotting/autoviz.py b/scripts/plotting/autoviz.py deleted file mode 100644 index 8dc6405..0000000 --- a/scripts/plotting/autoviz.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -#======================================================================= -#%% useful links -#https://towardsdatascience.com/autoviz-automatically-visualize-any-dataset-ba2691a8b55a -#https://pypi.org/project/autoviz/ -#======================================================================= -import os, sys -import pandas as pd -import numpy as np -import re -import argparse -from autoviz.AutoViz_Class import AutoViz_Class - -homedir = os.path.expanduser('~') -os.chdir(homedir + '/git/LSHTM_analysis/scripts') -#%%============================================================================ -# variables -gene = 'pncA' -drug = 'pyrazinamide' - -#%%============================================================================ -#============== -# directories -#============== -datadir = homedir + '/' + 'git/Data' - -indir = datadir + '/' + drug + '/input' - -outdir = datadir + '/' + drug + '/output' - -#======= -# input -#======= -in_filename_plotting = 'car_design.csv' -in_filename_plotting = gene.lower() + '_all_params.csv' -infile_plotting = outdir + '/' + in_filename_plotting -print('plotting file: ', infile_plotting - , '\n============================================================') -#======================================================================= -plotting_df = pd.read_csv(infile_plotting, sep = ',') -#Instantiate the AutoViz class -AV = AutoViz_Class() -df = AV.AutoViz(infile_plotting) -#df2 = AV.AutoViz(plotting_df) -plotting_df.columns[~plotting_df.columns.isin(df.columns)] diff --git a/scripts/plotting/barplots_subcolours_aa_PS.R b/scripts/plotting/barplots_subcolours_aa_PS.R index 5850abf..7c0a3aa 100755 --- a/scripts/plotting/barplots_subcolours_aa_PS.R +++ b/scripts/plotting/barplots_subcolours_aa_PS.R @@ -1,5 +1,4 @@ -#!/usr/bin/env Rscript -getwd() +#!/usr/bin/env Rscript getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() @@ -19,8 +18,7 @@ getwd() library(ggplot2) library(data.table) source("barplot_colour_function.R") -#source("subcols_axis.R") -source("subcols_axis_PS.R") +source("subcols_axis.R") # should return the following dfs, directories and variables # mut_pos_cols @@ -161,9 +159,7 @@ min(df$duet_scaled) max(df$duet_scaled) # sanity checks -# very important!!!! 
tapply(df$duet_scaled, df$duet_outcome, min) - tapply(df$duet_scaled, df$duet_outcome, max) # My colour FUNCTION: based on group and subgroup @@ -241,7 +237,7 @@ outPlot = g + , axis.ticks.x = element_blank()) + labs(title = "" #title = my_title - , x = "position" + , x = "Position" , y = "Frequency") print(outPlot) diff --git a/scripts/plotting/columns_all_params.csv b/scripts/plotting/columns_all_params.csv deleted file mode 100644 index fb8adc3..0000000 --- a/scripts/plotting/columns_all_params.csv +++ /dev/null @@ -1,87 +0,0 @@ -,x,,changes, -1,mutationinformation,,Mutationinformation, -2,wild_type,,,consider...wild_aa -3,position,,Position, -4,mutant_type,,,consider...mutant_aa -5,chain,,, -6,ligand_id,,, -7,ligand_distance,,, -8,duet_stability_change,,, -9,duet_outcome,,DUET_outcome, -10,ligand_affinity_change,,, -11,ligand_outcome,,Lig_outcome, -12,duet_scaled,,ratioDUET, -13,affinity_scaled,,ratioPredAff, -14,wild_pos,,WildPos, -15,wild_chain_pos,,, -16,ddg,,, -17,contacts,,, -18,electro_rr,,, -19,electro_mm,,, -20,electro_sm,,, -21,electro_ss,,, -22,disulfide_rr,,, -23,disulfide_mm,,, -24,disulfide_sm,,, -25,disulfide_ss,,, -26,hbonds_rr,,, -27,hbonds_mm,,, -28,hbonds_sm,,, -29,hbonds_ss,,, -30,partcov_rr,,, -31,partcov_mm,,, -32,partcov_sm,,, -33,partcov_ss,,, -34,vdwclashes_rr,,, -35,vdwclashes_mm,,, -36,vdwclashes_sm,,, -37,vdwclashes_ss,,, -38,volumetric_rr,,, -39,volumetric_mm,,, -40,volumetric_sm,,, -41,volumetric_ss,,, -42,wild_type_dssp,,, -43,asa,,, -44,rsa,,, -45,ss,,, -46,ss_class,,, -47,chain_id,,, -48,wild_type_kd,,, -49,kd_values,,, -50,rd_values,,, -51,wt_3letter_caps,,, -52,mutation,,, -53,af,,, -54,beta_logistic,,, -55,or_logistic,,, -56,pval_logistic,,, -57,se_logistic,,, -58,zval_logistic,,, -59,ci_low_logistic,,, -60,ci_hi_logistic,,, -61,or_mychisq,,, -62,or_fisher,,, -63,pval_fisher,,, -64,ci_low_fisher,,, -65,ci_hi_fisher,,, -66,est_chisq,,, -67,pval_chisq,,, -68,chromosome_number,,, -69,ref_allele,,, -70,alt_allele,,, -71,mut_type,,, -72,gene_id,,, -73,gene_number,,, -74,mut_region,,, -75,mut_info,,, -76,chr_num_allele,,, -77,wt_3let,,, -78,mt_3let,,, -79,af_kin,,, -80,or_kin,,, -81,pwald_kin,,, -82,beta_kin,,, -83,se_kin,,, -84,logl_h1_kin,,, -85,l_remle_kin,,, -86,n_miss,,, diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R index 3bf6262..911b0f8 100644 --- a/scripts/plotting/combining_dfs_plotting.R +++ b/scripts/plotting/combining_dfs_plotting.R @@ -6,6 +6,7 @@ # 2) _meta_data.csv # Output: +# 1) muts with opposite effects on stability # 2) large combined df including NAs for AF, OR,etc # Dim: same no. of rows as gene associated meta_data_with_AFandOR # 3) small combined df including NAs for AF, OR, etc. @@ -36,17 +37,23 @@ source("plotting_data.R") # dup_muts cat("Directories imported:" + , "\n====================" , "\ndatadir:", datadir , "\nindir:", indir , "\noutdir:", outdir , "\nplotdir:", plotdir) cat("Variables imported:" + , "\n=====================" , "\ndrug:", drug , "\ngene:", gene , "\ngene_match:", gene_match - , "\nLength of upos:", length(upos) - , "\nAngstrom symbol:", angstroms_symbol) + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. 
of duplicated muts:", dup_muts_nu + , "\ndr_muts_col:", dr_muts_col + , "\nother_muts_col:", other_muts_col + , "\ndrtype_col:", resistance_col) + # clear excess variable rm(my_df, upos, dup_muts) @@ -57,7 +64,6 @@ rm(my_df, upos, dup_muts) #in_file1: output of plotting_data.R # my_df_u - # infile 2: gene associated meta data #in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv") in_filename_gene_metadata = paste0(tolower(gene), "_metadata.csv") @@ -94,6 +100,8 @@ gene_metadata <- read.csv(infile_gene_metadata , header = T) cat("Dim:", dim(gene_metadata)) +table(gene_metadata$mutation_info) + # counting NAs in AF, OR cols # or_mychisq @@ -124,6 +132,7 @@ if (identical(sum(is.na(my_df_u$or_kin)) , "\nNA in AF:", sum(is.na(my_df_u$af_kin))) } +str(gene_metadata) ################################################################### # combining: PS @@ -145,7 +154,7 @@ merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata)) cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)" , "\nNo. of merging cols:", length(merging_cols) - , "\nMerging columns identified:\n")) + , "\nMerging columns identified:")) print(merging_cols) # important checks! @@ -160,7 +169,7 @@ merged_df2 = merge(x = gene_metadata , by = merging_cols , all.y = T) -cat("Dim of merged_df2: ", dim(merged_df2), "\n") +cat("Dim of merged_df2: ", dim(merged_df2)) head(merged_df2$position) # sanity check @@ -170,10 +179,10 @@ if(nrow(gene_metadata) == nrow(merged_df2)){ ,"\nExpected no. of rows: ",nrow(gene_metadata) ,"\nGot no. of rows: ", nrow(merged_df2)) } else{ - cat("\nFAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)" + cat("FAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)" , "\nExpected no. of rows after merge: ", nrow(gene_metadata) , "\nGot no. of rows: ", nrow(merged_df2) - , "\nFinding discrepancy\n") + , "\nFinding discrepancy") merged_muts_u = unique(merged_df2$mutationinformation) meta_muts_u = unique(gene_metadata$mutationinformation) # find the index where it differs @@ -227,6 +236,7 @@ if (identical(sum(is.na(merged_df3$or_kin)) , "\nNA in AF:", sum(is.na(merged_df3$af_kin))) } + #========================= # Merge3: merged_df2_comp # same as merge 1 but excluding NAs from ORs, etc. @@ -237,6 +247,7 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)" na_count_df2 = sum(is.na(merged_df2$af)) merged_df2_comp = merged_df2[!is.na(merged_df2$af),] + # sanity check: no +-1 gymnastics cat("Checking nrows in merged_df2_comp") if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){ @@ -246,20 +257,22 @@ if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){ , "\nNo. of rows: ", nrow(merged_df2_comp) , "\nNo. of cols: ", ncol(merged_df2_comp)) }else{ - cat("FAIL: No. of rows mismatch" - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 - ,"\nGot no. of rows: ", nrow(merged_df2_comp)) + cat("FAIL: No. of rows mismatch" + ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 + ,"\nGot no. 
of rows: ", nrow(merged_df2_comp)) } + #========================= # Merge4: merged_df3_comp # same as merge 2 but excluding NAs from ORs, etc or # remove duplicate mutation information #========================= - na_count_df3 = sum(is.na(merged_df3$af)) #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way + merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way -cat("\nChecking nrows in merged_df3_comp") +cat("Checking nrows in merged_df3_comp") + if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){ cat("\nPASS: No. of rows match" ,"\nDim of merged_df3_comp: " @@ -267,12 +280,11 @@ if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){ , "\nNo. of rows: ", nrow(merged_df3_comp) , "\nNo. of cols: ", ncol(merged_df3_comp)) }else{ - cat("\nFAIL: No. of rows mismatch" + cat("FAIL: No. of rows mismatch" ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 - , "\nGot no. of rows: ", nrow(merged_df3_comp)) + ,"\nGot no. of rows: ", nrow(merged_df3_comp)) } - # alternate way of deriving merged_df3_comp foo = merged_df3[!is.na(merged_df3$af),] bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] @@ -307,7 +319,8 @@ all.equal(foo, bar) # clear variables rm(foo, bar, gene_metadata , in_filename_params, infile_params, merging_cols - , in_filename_gene_metadata, infile_gene_metadata) + , in_filename_gene_metadata, infile_gene_metadata + , merged_df2v2, merged_df2v3) #************************* ##################################################################### # Combining: LIG @@ -334,4 +347,4 @@ if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){ #========================================================================== # end of script -##========================================================================= +##========================================================================== diff --git a/scripts/plotting/combining_two_df_FIXME.R b/scripts/plotting/combining_two_df_FIXME.R old mode 100755 new mode 100644 diff --git a/scripts/plotting/corr_plots.R b/scripts/plotting/corr_plots.R new file mode 100644 index 0000000..00c2c5a --- /dev/null +++ b/scripts/plotting/corr_plots.R @@ -0,0 +1,263 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Corr plots for PS and Lig + +# Output: 1 svg + +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + + +source("Header_TT.R") +require(cowplot) +source("combining_dfs_plotting.R") +# should return the following dfs, directories and variables + +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig + +# 9) my_df_u +# 10) my_df_u_lig + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. 
of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3)) + +#======= +# output +#======= +# can't combine by cowplot because not ggplots +#corr_plot_combined = "corr_combined.svg" +#plot_corr_plot_combined = paste0(plotdir,"/", corr_plot_combined) + +# PS +corr_ps = "corr_PS.svg" +plot_corr_ps = paste0(plotdir,"/", corr_ps) + +# LIG +corr_lig = "corr_LIG.svg" +plot_corr_lig = paste0(plotdir,"/", corr_lig) + +#################################################################### +# end of loading libraries and functions # +######################################################################## + +#%%%%%%%%%%%%%%%%%%%%%%%%% +df_ps = merged_df3_comp +df_lig = merged_df3_comp_lig +#%%%%%%%%%%%%%%%%%%%%%%%%% + +rm( merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#=========================== +# Data for Correlation plots:PS +#=========================== +table(df_ps$duet_outcome) + +# adding log cols +df_ps$log10_or_mychisq = log10(df_ps$or_mychisq) +df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher) + +df_ps$log10_or_kin = log10(df_ps$or_kin) +df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin) + +# subset data to generate pairwise correlations +cols_to_select = c("duet_scaled" + + , "log10_or_mychisq" + , "neglog_pval_fisher" + + , "log10_or_kin" + , "neglog_pwald_kin" + + , "af" + + , "duet_outcome" + , drug) + +corr_data_ps = df_ps[, cols_to_select] + +dim(corr_data_ps) + +#p_italic = substitute(paste("-Log(", italic('P'), ")"));p_italic +#p_adjusted_italic = substitute(paste("-Log(", italic('P adjusted'), ")"));p_adjusted_italic + +# assign nice colnames (for display) +my_corr_colnames = c("DUET" + + , "Log(OR)" + , "-Log(P)" + + , "Log(OR adjusted)" + , "-Log(P wald)" + + , "AF" + + , "duet_outcome" + , drug) + +length(my_corr_colnames) + +colnames(corr_data_ps) +colnames(corr_data_ps) <- my_corr_colnames +colnames(corr_data_ps) + +#----------------- +# generate corr PS plot +#----------------- +start = 1 +end = which(colnames(corr_data_ps) == drug); end # should be the last column +offset = 1 + +my_corr_ps = corr_data_ps[start:(end-offset)] +head(my_corr_ps) + +#my_cols = c("#f8766d", "#00bfc4") +# deep blue :#007d85 +# deep red: #ae301e + +cat("Corr plot PS:", plot_corr_ps) +svg(plot_corr_ps, width = 15, height = 15) + +OutPlot1 = pairs.panels(my_corr_ps[1:(length(my_corr_ps)-1)] + , method = "spearman" # correlation method + , hist.col = "grey" ##00AFBB + , density = TRUE # show density plots + , ellipses = F # show correlation ellipses + , stars = T + , rug = F + , breaks = "Sturges" + , show.points = T + , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_ps$duet_outcome))] + , pch = 21 + , jitter = T + #, alpha = .05 + #, points(pch = 19, col = c("#f8766d", "#00bfc4")) + , cex = 3 + , cex.axis = 2.5 + , cex.labels = 2.1 + , cex.cor = 1 + , smooth = F +) + +print(OutPlot1) +dev.off() + +#=========================== +# Data for Correlation plots: LIG +#=========================== +table(df_lig$ligand_outcome) + +df_lig$log10_or_mychisq = log10(df_lig$or_mychisq) +df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher) + + +df_lig$log10_or_kin = log10(df_lig$or_kin) +df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin) + + +# subset data to generate pairwise 
correlations +cols_to_select = c("affinity_scaled" + + , "log10_or_mychisq" + , "neglog_pval_fisher" + + , "log10_or_kin" + , "neglog_pwald_kin" + + , "af" + + , "ligand_outcome" + , drug) + +corr_data_lig = df_lig[, cols_to_select] + + +dim(corr_data_lig) + +# assign nice colnames (for display) +my_corr_colnames = c("Ligand Affinity" + + , "Log(OR)" + , "-Log(P)" + + , "Log(OR adjusted)" + , "-Log(P wald)" + + , "AF" + + , "ligand_outcome" + , drug) + +length(my_corr_colnames) + +colnames(corr_data_lig) +colnames(corr_data_lig) <- my_corr_colnames +colnames(corr_data_lig) + +#----------------- +# generate corr LIG plot +#----------------- + +start = 1 +end = which(colnames(corr_data_lig) == drug); end # should be the last column +offset = 1 + +my_corr_lig = corr_data_lig[start:(end-offset)] +head(my_corr_lig) + +cat("Corr LIG plot:", plot_corr_lig) +svg(plot_corr_lig, width = 15, height = 15) + +OutPlot2 = pairs.panels(my_corr_lig[1:(length(my_corr_lig)-1)] + , method = "spearman" # correlation method + , hist.col = "grey" ##00AFBB + , density = TRUE # show density plots + , ellipses = F # show correlation ellipses + , stars = T + , rug = F + , breaks = "Sturges" + , show.points = T + , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_lig$ligand_outcome))] + , pch = 21 + , jitter = T + #, alpha = .05 + #, points(pch = 19, col = c("#f8766d", "#00bfc4")) + , cex = 3 + , cex.axis = 2.5 + , cex.labels = 2.1 + , cex.cor = 1 + , smooth = F +) + +print(OutPlot2) +dev.off() +####################################################### diff --git a/scripts/plotting/dirs.R b/scripts/plotting/dirs.R new file mode 100644 index 0000000..54fb0bf --- /dev/null +++ b/scripts/plotting/dirs.R @@ -0,0 +1,56 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: formatting data that will be used for various plots + +# useful links +#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns +######################################################### +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting") +getwd() + +#source("Header_TT.R") +library(ggplot2) +library(data.table) +library(dplyr) + +require("getopt", quietly = TRUE) #cmd parse arguments +#======================================================== +# command line args +#spec = matrix(c( +# "drug" , "d", 1, "character", +# "gene" , "g", 1, "character" +#), byrow = TRUE, ncol = 4) + +#opt = getopt(spec) + +#drug = opt$druggene = opt$gene + +#if(is.null(drug)|is.null(gene)) { +# stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") +#} +#======================================================== +# FIXME: change to cmd +#%% variable assignment: input and output paths & filenames +drug = "pyrazinamide" +gene = "pncA" +gene_match = paste0(gene,"_p.") +cat(gene_match) + +#============= +# directories and variables +#============= +datadir = paste0("~/git/Data") +indir = paste0(datadir, "/", drug, "/input") +outdir = paste0("~/git/Data", "/", drug, "/output") +plotdir = paste0("~/git/Data", "/", drug, "/output/plots") + +dr_muts_col = paste0('dr_mutations_', drug) +other_muts_col = paste0('other_mutations_', drug) +resistance_col = "drtype" + +#%%=============================================================== + + + diff --git a/scripts/plotting/lineage_basic_barplot.R b/scripts/plotting/lineage_basic_barplot.R new file mode 100644 index 0000000..e4503d1 --- /dev/null +++ 
b/scripts/plotting/lineage_basic_barplot.R @@ -0,0 +1,214 @@ +#!/usr/bin/env Rscript +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() +######################################################### +# TASK: Basic lineage barplot showing numbers + +# Output: Basic barplot with lineage samples and mut count + +########################################################## +# Installing and loading required packages +########################################################## +source("Header_TT.R") +require(data.table) +source("combining_dfs_plotting.R") +# should return the following dfs, directories and variables + +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig + +# 9) my_df_u +# 10) my_df_u_lig + +cat("Directories imported:" + , "\n====================" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir) + +cat("Variables imported:" + , "\n=====================" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3 + , "\ndr_muts_col:", dr_muts_col + , "\nother_muts_col:", other_muts_col + , "\ndrtype_col:", resistance_col) + + +#=========== +# input +#=========== +# output of combining_dfs_plotting.R + +#======= +# output +#======= +# plot 1 +basic_bp_lineage = "basic_lineage_barplot.svg" +plot_basic_bp_lineage = paste0(plotdir,"/", basic_bp_lineage) + +#======================================================================= +#================ +# Data for plots: +# you need merged_df2, comprehensive one +# since this has one-many relationship +# i.e the same SNP can belong to multiple lineages +#================ +# REASSIGNMENT as necessary +my_df = merged_df2 + +# clear excess variable +rm(merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +#========================== +# Plot: Lineage barplot +# x = lineage y = No. of samples +# col = Lineage +# fill = lineage +#============================ +table(my_df$lineage) +as.data.frame(table(my_df$lineage)) + +#============= +# Data for plots +#============= +# REASSIGNMENT +df <- my_df + +rm(my_df) + +# get freq count of positions so you can subset freq<1 +#setDT(df)[, lineage_count := .N, by = .(lineage)] + +#****************** +# generate plot: barplot of mutation by lineage +#****************** +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4" + #, "lineage5" + #, "lineage6" + #, "lineage7" + ) + +df_lin = subset(df, subset = lineage %in% sel_lineages) + +# Create df with lineage inform & no. 
of unique mutations +# per lineage and total samples within lineage +# this is essentially barplot with two y axis + +bar = bar = as.data.frame(sel_lineages) #4, 1 +total_snps_u = NULL +total_samples = NULL + +for (i in sel_lineages){ + #print(i) + curr_total = length(unique(df$id)[df$lineage==i]) + total_samples = c(total_samples, curr_total) + print(total_samples) + + foo = df[df$lineage==i,] + print(paste0(i, "=======")) + print(length(unique(foo$mutationinformation))) + curr_count = length(unique(foo$mutationinformation)) + + total_snps_u = c(total_snps_u, curr_count) +} + +print(total_snps_u) +bar$num_snps_u = total_snps_u +bar$total_samples = total_samples +bar + +#***************** +# generate plot: lineage barplot with two y-axis +#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2 +#***************** + +y1 = bar$num_snps_u +y2 = bar$total_samples +x = sel_lineages + +to_plot = data.frame(x = x + , y1 = y1 + , y2 = y2) +to_plot + +# FIXME later: will be depricated! +melted = melt(to_plot, id = "x") +melted + + +svg(plot_basic_bp_lineage) + +my_ats = 20 # axis text size +my_als = 22 # axis label size + +g = ggplot(melted, aes(x = x + , y = value + , fill = variable)) + +printFile = g + geom_bar(stat = "identity" + , position = position_stack(reverse = TRUE) + , alpha=.75 + , colour='grey75') + + theme(axis.text.x = element_text(size = my_ats) + , axis.text.y = element_text(size = my_ats + #, angle = 30 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_als + , colour = 'black') + , axis.title.y = element_text(size = my_als + , colour = 'black') + , legend.position = "top" + , legend.text = element_text(size = my_als)) + + #geom_text() + + geom_label(aes(label = value) + , size = 5 + , hjust = 0.5 + , vjust = 0.5 + , colour = 'black' + , show.legend = FALSE + #, check_overlap = TRUE + , position = position_stack(reverse = T)) + + labs(title = '' + , x = '' + , y = "Number" + , fill = 'Variable' + , colour = 'black') + + scale_fill_manual(values = c('grey50', 'gray75') + , name='' + , labels=c('Mutations', 'Total Samples')) + + scale_x_discrete(breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4') + , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')) + +print(printFile) +dev.off() diff --git a/scripts/plotting/lineage_dist_combined_PS.R b/scripts/plotting/lineage_dist_combined_PS.R new file mode 100644 index 0000000..f59cf44 --- /dev/null +++ b/scripts/plotting/lineage_dist_combined_PS.R @@ -0,0 +1,301 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Lineage dist plots: ggridges + +# Output: 2 SVGs for PS stability + +# 1) all muts +# 2) dr_muts + +########################################################## +# Installing and loading required packages +########################################################## +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +source("Header_TT.R") +library(ggridges) +source("combining_dfs_plotting.R") +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig + +# 9) my_df_u +# 10) my_df_u_lig + +cat("Directories imported:" + , "\n====================" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir) + +cat("Variables imported:" + , "\n=====================" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , 
"\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3 + , "\ndr_muts_col:", dr_muts_col + , "\nother_muts_col:", other_muts_col + , "\ndrtype_col:", resistance_col) + +#======= +# output +#======= +lineage_dist_combined = "lineage_dist_combined_PS.svg" +plot_lineage_dist_combined = paste0(plotdir,"/", lineage_dist_combined) +#======================================================================== + +########################### +# Data for plots +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +# using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available, hence use df with NA +########################### +# REASSIGNMENT +my_df = merged_df2 + +# delete variables not required +rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +table(my_df$mutation_info) + +# subset df with dr muts only +my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") +table(my_df_dr$mutation_info) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#========================== +# Plot 1: ALL Muts +# x = mcsm_values, y = dist +# fill = stability +#============================ + +my_plot_name = 'lineage_dist_PS.svg' + +plot_lineage_duet = paste0(plotdir,"/", my_plot_name) + +#=================== +# Data for plots +#=================== +table(my_df$lineage); str(my_df$lineage) + +# subset only lineages1-4 +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4" + #, "lineage5" + #, "lineage6" + #, "lineage7" + ) + +# uncomment as necessary +df_lin = subset(my_df, subset = lineage %in% sel_lineages ) +table(df_lin$lineage) + +# refactor +df_lin$lineage = factor(df_lin$lineage) + +sum(table(df_lin$lineage)) #{RESULT: Total number of samples for lineage} + +table(df_lin$lineage)#{RESULT: No of samples within lineage} + +length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to} + +length(df_lin$mutationinformation) + +u2 = unique(my_df$mutationinformation) +u = unique(df_lin$mutationinformation) +check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} + +#%%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +df <- df_lin +#%%%%%%%%%%%%%%%%%%%%%%%%% + +rm(df_lin) + +#****************** +# generate distribution plot of lineages +#****************** +# 2 : ggridges (good!) +my_ats = 15 # axis text size +my_als = 20 # axis label size + +my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4' + #, 'Lineage 5', 'Lineage 6', 'Lineage 7' + ) +names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4' + # , 'lineage5', 'lineage6', 'lineage7' + ) +# check plot name +plot_lineage_duet + +# output svg +#svg(plot_lineage_duet) +p1 = ggplot(df, aes(x = duet_scaled + , y = duet_outcome))+ + + #printFile=geom_density_ridges_gradient( + geom_density_ridges_gradient(aes(fill = ..x..) 
+ , scale = 3 + , size = 0.3 ) + + facet_wrap( ~lineage + , scales = "free" + #, switch = 'x' + , labeller = labeller(lineage = my_labels) ) + + coord_cartesian( xlim = c(-1, 1)) + + scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") + , name = "DUET" ) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + + , axis.text.y = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = my_als-5) + , legend.title = element_text(size = my_als) +) + +print(p1) +#dev.off() + +####################################################################### +# lineage distribution plot for dr_muts +####################################################################### + +#========================== +# Plot 2: dr muts ONLY +# x = mcsm_values, y = dist +# fill = stability +#============================ + +my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg' + +plot_lineage_dr_duet = paste0(plotdir,"/", my_plot_name_dr) + +#=================== +# Data for plots +#=================== +table(my_df_dr$lineage); str(my_df_dr$lineage) + +# uncomment as necessary +df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages) +table(df_lin_dr$lineage) + +# refactor +df_lin_dr$lineage = factor(df_lin_dr$lineage) + +sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage} + +table(df_lin_dr$lineage)#{RESULT: No of samples within lineage} + +length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to} + +length(df_lin_dr$mutationinformation) + +u2 = unique(my_df_dr$mutationinformation) +u = unique(df_lin_dr$mutationinformation) +check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages} + +#%%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +df_dr <- df_lin_dr +#%%%%%%%%%%%%%%%%%%%%%%%%% + +rm(df_lin_dr) + +#****************** +# generate distribution plot of lineages +#****************** +# 2 : ggridges (good!) +my_ats = 15 # axis text size +my_als = 20 # axis label size + + +# check plot name +plot_lineage_dr_duet + +# output svg +#svg(plot_lineage_dr_duet) +p2 = ggplot(df_dr, aes(x = duet_scaled + , y = duet_outcome))+ + + #printFile=geom_density_ridges_gradient( + geom_density_ridges_gradient(aes(fill = ..x..) 
+ , scale = 3 + , size = 0.3 ) + + facet_wrap( ~lineage + , scales = "free" + #, switch = 'x' + , labeller = labeller(lineage = my_labels) ) + + coord_cartesian( xlim = c(-1, 1) + #, ylim = c(0, 6) + #, clip = "off" + ) + + scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4") + , name = "DUET" ) + + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_blank() + , axis.title.x = element_blank() + , axis.title.y = element_blank() + , axis.ticks.y = element_blank() + , plot.title = element_blank() + , strip.text = element_text(size = my_als) + , legend.text = element_text(size = 10) + , legend.title = element_text(size = my_als) + #, legend.position = "none" + ) + +print(p2) +#dev.off() +######################################################################## +#============== +# combine plot +#=============== + +svg(plot_lineage_dist_combined, width = 12, height = 6) + +printFile = cowplot::plot_grid(p1, p2 + , label_size = my_als+10) + +print(printFile) +dev.off() diff --git a/scripts/plotting/opp_mcsm_muts.R b/scripts/plotting/opp_mcsm_muts.R new file mode 100644 index 0000000..1ca1f6c --- /dev/null +++ b/scripts/plotting/opp_mcsm_muts.R @@ -0,0 +1,96 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: To write muts with opposite effects on +# protomer and ligand stability +######################################################### +# working dir and loading libraries + +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +source("plotting_data.R") + +# should return the following dfs, directories and variables +# my_df +# my_df_u +# my_df_u_lig +# dup_muts + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nLength of upos:", length(upos) + , "\nAngstrom symbol:", angstroms_symbol)) + +# clear excess variable +rm(my_df, upos, dup_muts) +#======================================================== +#=========== +# input +#=========== +#in_file1: output of plotting_data.R +# my_df_u + +#=========== +# output +#=========== +# mutations with opposite effects +out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv") +outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts) + +#%%=============================================================== + +# spelling Correction 1: DUET incase American spelling needed! +table(my_df_u$duet_outcome); sum(table(my_df_u$duet_outcome) ) +#my_df_u$duet_outcome[my_df_u$duet_outcome=="Stabilising"] <- "Stabilizing" +#my_df_u$duet_outcome[my_df_u$duet_outcome=="Destabilising"] <- "Destabilizing" + + +# spelling Correction 2: Ligand incase American spelling needed! +table(my_df_u$ligand_outcome); sum(table(my_df_u$ligand_outcome) ) +#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Stabilising"] <- "Stabilizing" +#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Destabilising"] <- "Destabilizing" + + +# muts with opposing effects on protomer and ligand stability +table(my_df_u$duet_outcome != my_df_u$ligand_outcome) +changes = my_df_u[which(my_df_u$duet_outcome != my_df_u$ligand_outcome),] + +# sanity check: redundant, but uber cautious! 
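+# dl_i and ld_i capture the same mismatching rows from both directions and should be identical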
+dl_i = which(my_df_u$duet_outcome != my_df_u$ligand_outcome) +ld_i = which(my_df_u$ligand_outcome != my_df_u$duet_outcome) + +cat("Identifying muts with opposite stability effects") +if(nrow(changes) == (table(my_df_u$duet_outcome != my_df_u$ligand_outcome)[[2]]) & identical(dl_i,ld_i)) { + cat("PASS: muts with opposite effects on stability and affinity correctly identified" + , "\nNo. of such muts: ", nrow(changes)) +}else { + cat("FAIL: unsuccessful in extracting muts with changed stability effects") +} + +#========================== +# write file: changed muts +#========================== +write.csv(changes, outfile_opp_muts) + +cat("Finished writing file for muts with opp effects:" + , "\nFilename: ", outfile_opp_muts + , "\nDim:", dim(changes)) + +# clear variables +rm(out_filename_opp_muts, outfile_opp_muts) +rm(changes, dl_i, ld_i) + +# count na in each column +na_count = sapply(my_df_u, function(y) sum(length(which(is.na(y))))); na_count +df_ncols = ncol(my_df_u) + +#===================================== end of script diff --git a/scripts/plotting/or_plots_combined.R b/scripts/plotting/or_plots_combined.R new file mode 100644 index 0000000..a664f1b --- /dev/null +++ b/scripts/plotting/or_plots_combined.R @@ -0,0 +1,192 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: Bubble plot of OR for PS and Lig + +# Output: 1 svg + +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + + +source("Header_TT.R") +require(cowplot) +source("combining_dfs_plotting.R") +# should return the following dfs, directories and variables + +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig + +# 9) my_df_u +# 10) my_df_u_lig + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. 
of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3)) + +#========================= +#======= +# output +#======= +or_combined = "or_combined_PS_LIG.svg" +plot_or_combined = paste0(plotdir,"/", or_combined) + +#or_kin_combined = "or_kin_combined_PS_LIG.svg" +#plot_or_kin_combined = paste0(plotdir,"/", or_kin_combined) +#======================================================================= + +########################### +# Data for OR and stability plots +# you need merged_df3_comp +# since these are matched +# to allow pairwise corr +########################### + +ps_df = merged_df3_comp +lig_df = merged_df3_comp_lig + +# Ensure correct data type in columns to plot: should be TRUE +is.numeric(ps_df$or_mychisq) +is.numeric(lig_df$or_mychisq) + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig) +#%% end of section 1 + +# sanity check: should be <10 +if (max(lig_df$ligand_distance) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} + +############# +# Plots: Bubble plot +# x = Position, Y = stability +# size of dots = OR +# col: stability +############# + +#----------------- +# Plot 1: DUET vs OR by position as geom_points +#------------------- + +my_ats = 20 # axis text size +my_als = 22 # axis label size + +# Spelling Correction: made redundant as already corrected at the source +#ps_df$duet_outcome[ps_df$duet_outcome=='Stabilizing'] <- 'Stabilising' +#ps_df$duet_outcome[ps_df$duet_outcome=='Destabilizing'] <- 'Destabilising' + +table(ps_df$duet_outcome) ; sum(table(ps_df$duet_outcome)) + +g1 = ggplot(ps_df, aes(x = factor(position) + , y = duet_scaled)) + +p1 = g1 + + geom_point(aes(col = duet_outcome + , size = or_mychisq))+ + #, size = or_kin)) + # not good, almost like log(or) + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , legend.text = element_text(size = my_als) + , legend.title = element_text(size = my_als) ) + + #, legend.key.size = unit(1, "cm")) + + labs(title = "" + , x = "Position" + , y = "DUET(PS)" + , size = "Odds Ratio" + , colour = "DUET Outcome") + + guides(colour = guide_legend(override.aes = list(size=4))) + +p1 + +#------------------- +# generate plot 2: Lig vs OR by position as geom_points +#------------------- + +# Spelling Correction: made redundant as already corrected at the source + +#lig_df$ligand_outcome[lig_df$ligand_outcome=='Stabilizing'] <- 'Stabilising' +#lig_df$ligand_outcome[lig_df$ligand_outcome=='Destabilizing'] <- 'Destabilising' + +table(lig_df$ligand_outcome) + +g2 = ggplot(lig_df, aes(x = factor(position) + , y = affinity_scaled)) + +p2 = g2 + + geom_point(aes(col = ligand_outcome + , size = or_mychisq))+ + #, size = or_kin)) + # not good! 
almost like log(or) + theme(axis.text.x = element_text(size = my_ats + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_als) + , axis.title.y = element_text(size = my_als) + , legend.text = element_text(size = my_als) + , legend.title = element_text(size = my_als) ) + + #, legend.key.size = unit(1, "cm")) + + labs(title = "" + , x = "Position" + , y = "Ligand Affinity" + , size = "Odds Ratio" + , colour = "Ligand Outcome" + ) + + guides(colour = guide_legend(override.aes = list(size=4))) + +p2 + +#====================== +# combine using cowplot +#====================== + +svg(plot_or_combined, width = 32, height = 12) +#svg(plot_or_kin_combined, width = 32, height = 12) + +theme_set(theme_gray()) # to preserve default theme + +printFile = cowplot::plot_grid(plot_grid(p1, p2 + , ncol = 1 + , align = 'v' + , labels = c("", "") + , label_size = my_als+5)) +print(printFile) +dev.off() + diff --git a/scripts/plotting/plotting_data.R b/scripts/plotting/plotting_data.R index dd5760e..6d907f1 100755 --- a/scripts/plotting/plotting_data.R +++ b/scripts/plotting/plotting_data.R @@ -15,7 +15,10 @@ library(ggplot2) library(data.table) library(dplyr) +source("dirs.R") + require("getopt", quietly = TRUE) #cmd parse arguments + #======================================================== # command line args #spec = matrix(c( @@ -31,19 +34,6 @@ require("getopt", quietly = TRUE) #cmd parse arguments # stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") #} #======================================================== -#%% variable assignment: input and output paths & filenames -drug = "pyrazinamide" -gene = "pncA" -gene_match = paste0(gene,"_p.") -cat(gene_match) - -#============= -# directories -#============= -datadir = paste0("~/git/Data") -indir = paste0(datadir, "/", drug, "/input") -outdir = paste0("~/git/Data", "/", drug, "/output") -plotdir = paste0("~/git/Data", "/", drug, "/output/plots") #====== # input #====== @@ -52,6 +42,17 @@ in_filename_params = paste0(tolower(gene), "_all_params.csv") infile_params = paste0(outdir, "/", in_filename_params) cat(paste0("Input file 1:", infile_params) ) + +cat('columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , "\n" + , resistance_col + , '\n===============================================================') + #%%=============================================================== ########################### # Read file: struct params @@ -62,9 +63,6 @@ my_df = read.csv(infile_params, header = T) cat("\nInput dimensions:", dim(my_df)) -# quick checks -#colnames(my_df) -#str(my_df) ########################### # extract unique mutation entries diff --git a/scripts/plotting/ps_plots_combined.R b/scripts/plotting/ps_plots_combined.R new file mode 100644 index 0000000..afcdc25 --- /dev/null +++ b/scripts/plotting/ps_plots_combined.R @@ -0,0 +1,251 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: AF, OR and stability plots: PS +# Output: 1 SVG + +######################################################### +# Installing and loading required packages +########################################################## +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +source("Header_TT.R") +library(ggplot2) +library(data.table) +#source("combining_dfs_plotting.R") +source("barplot_colour_function.R") +source("subcols_axis.R") + 
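+# subcols_axis.R should return my_df_u_cols and mut_pos_cols (assumption based on their use below for axis colours)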
+########################### +# Data for PS plots +# you need merged_df3_comp/merged_df_comp +# since these have unique SNPs +########################### + +no_or_index = which(is.na(my_df_u_cols$or_mychisq)) + +my_df_u_cols_clean = my_df_u_cols[-no_or_index,] + +#%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT +df = my_df_u_cols_clean +#%%%%%%%%%%%%%%%%%%%%%%%%% +cols_to_select = colnames(mut_pos_cols) + +mut_pos_cols_clean = df[colnames(df)%in%cols_to_select] +mut_pos_cols_clean = unique(mut_pos_cols_clean[, 1:length(mut_pos_cols_clean)]) + +######################################################################## +# end of data extraction and cleaning for plots # +######################################################################## + +#=========================== +# Barplot with axis colours: +#=========================== +#================ +# Inspecting mut_pos_cols +# position numbers and colours and assigning axis colours based on lab_fg +# of the correct df +# open file from desktop ("sample_axis_cols") for cross checking +#================ +# very important! +#my_axis_colours = mut_pos_cols$lab_fg + +if (nrow(mut_pos_cols_clean) == length(unique(df$position)) ){ + print("PASS: lengths checked, assigning axis colours") + my_axis_colours = mut_pos_cols_clean$lab_fg + cat("length of axis colours:", length(my_axis_colours) + , "\nwhich corresponds to the number of positions on the x-axis of the plot") +}else{ + print("FAIL:lengths mismatch, could not assign axis colours") + quit() +} + +# unique positions +upos = unique(df$position); length(upos) + +table(df$duet_outcome) + +# add frequency of positions (from lib data.table) +setDT(df)[, pos_count := .N, by = .(position)] + +# this is cummulative +table(df$pos_count) + +# use group by on this +library(dplyr) +snpsBYpos_df <- df %>% + group_by(position) %>% + summarize(snpsBYpos = mean(pos_count)) + +table(snpsBYpos_df$snpsBYpos) + +snp_count = sort(unique(snpsBYpos_df$snpsBYpos)) + +# sanity checks +# should be a factor +if (is.factor(df$duet_outcome)){ + print("duet_outcome is factor") +}else{ + print("converting duet_outcome to factor") + df$duet_outcome = as.factor(df$duet_outcome) +} + +is.factor(df$duet_outcome) + +# should be -1 and 1 +min(df$duet_scaled) +max(df$duet_scaled) + +# sanity checks +tapply(df$duet_scaled, df$duet_outcome, min) +tapply(df$duet_scaled, df$duet_outcome, max) + +# My colour FUNCTION: based on group and subgroup +# in my case; +# df = df +# group = duet_outcome +# subgroup = normalised score i.e duet_scaled + +# check unique values in normalised data +u = unique(df$duet_scaled) +cat("No. of unique values in normalised data:", length(u)) + +# Define group +# Create an extra column called group which contains the "gp name and score" +# so colours can be generated for each unique values in this column +my_grp = df$duet_scaled # no rounding +df$group <- paste0(df$duet_outcome, "_", my_grp, sep = "") + +# Call the function to create the palette based on the group defined above +colours <- ColourPalleteMulti(df, "duet_outcome", "my_grp") +print(paste0("Colour palette generated for: ", length(colours), " colours")) +my_title = "Protein stability (DUET)" +cat("No. 
of axis colours: ", length(my_axis_colours)) + +#****************** +# generate plot: barplot unordered by frequency with axis colours +#****************** +class(df$lab_bg) + +# define cartesian coord +my_xlim = length(unique(df$position)); my_xlim + +# axis label size +my_xals = 18 +my_yals = 18 + +# axes text size +my_xats = 14 +my_yats = 18 + +g3 = ggplot(df, aes(factor(position, ordered = T))) + +p3 = g3 + + coord_cartesian(xlim = c(1, my_xlim) + #, ylim = c(0, 6) + , ylim = c(0, max(snp_count)) + , clip = "off") + + geom_bar(aes(fill = group), colour = "grey") + + scale_fill_manual(values = colours + , guide = "none") + + geom_tile(aes(,-0.8, width = 0.95, height = 0.85) + , fill = df$lab_bg) + + geom_tile(aes(,-1.2, width = 0.95, height = -0.2) + , fill = df$lab_bg2) + + + # Here it"s important to specify that your axis goes from 1 to max number of levels + theme(axis.text.x = element_text(size = my_xats + , angle = 90 + , hjust = 1 + , vjust = 0.4 + , colour = my_axis_colours) + , axis.text.y = element_text(size = my_yats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_text(size = my_xals) + , axis.title.y = element_text(size = my_yals+2 ) + , axis.ticks.x = element_blank()) + + labs(title = "" + #title = my_title + , x = "Position" + , y = "Frequency") + +p3 +#================= +# generate plot: AF by position unordered, not coloured +#================= +my_ats = 20 # axis text size +my_als = 22 # axis label size + +g1 = ggplot(df) +p1 = g1 + + geom_bar(aes(x = factor(position, ordered = T) + , y = af*100 + #, fill = duet_outcome + ) + , color = "black" + , fill = "lightgrey" + , stat = "identity") + + theme(axis.text.x = element_blank() + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_blank() + , axis.title.y = element_text(size = my_als) ) + + labs(title = "" + #, size = 100 + #, x = "Wild-type position" + , y = "AF(%)") + +p1 +#================= +# generate plot: OR by position unordered +#================= +my_ats = 20 # axis text size +my_als = 22 # axis label size + + +g2 = ggplot(df) + +p2 = g2 + + geom_bar(aes(x = factor(position, ordered = T) + , y = or_mychisq + #, fill = duet_outcome + ) + , colour = "black" + , fill = "lightgray" + , stat = "identity") + + theme(axis.text.x = element_blank() + , axis.text.y = element_text(size = my_ats + , angle = 0 + , hjust = 1 + , vjust = 0) + , axis.title.x = element_blank() + , axis.title.y = element_text(size = my_als) ) + + labs(#title = "OR by position" + #, x = "Wild-type position" + y = "OR") + +p2 + +######################################################################## +# end of DUET barplots # +######################################################################## +#======= +# output +#======= +ps_plots_combined = "af_or_combined_PS.svg" +plot_ps_plots_combined = paste0(plotdir,"/", ps_plots_combined ) + +svg(plot_ps_plots_combined , width = 26, height = 12) + +#theme_set(theme_gray()) # to preserve default theme +printFile = cowplot::plot_grid(p1, p2, p3 + , ncol = 1 + , align = 'v') +printFile +dev.off() diff --git a/scripts/plotting/resolving_ambiguous_muts.R b/scripts/plotting/resolving_ambiguous_muts.R new file mode 100644 index 0000000..8edb806 --- /dev/null +++ b/scripts/plotting/resolving_ambiguous_muts.R @@ -0,0 +1,136 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: To resolve ambiguous muts present in _metadata.csv +#(which is one of the outputs from python script) + +# Input 
csv file: _metadata.csv + +# Output csv file: _metadata_clean.csv + +######################################################### +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/plotting/") +getwd() + +source("dirs.R") + +cat("Directories imported:" + , "\n====================" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir) + +cat("Variables imported:" + , "\n=====================" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\ndr_muts_col:", dr_muts_col + , "\nother_muts_col:", other_muts_col + , "\ndrtype_col:", resistance_col) + +#======================================================== +#=========== +# input +#=========== +# infile1: gene associated meta data +gene_metadata_raw = paste0(tolower(gene), "_metadata_raw.csv") +infile_gene_metadata_raw = paste0(outdir, "/", gene_metadata_raw) +cat("Input infile 1:", infile_gene_metadata_raw) + +# infile 2: ambiguous muts +ambiguous_muts = paste0(tolower(gene), "_ambiguous_mut_names.csv") +infile_ambiguous_muts = paste0(outdir, "/", ambiguous_muts) +cat("Input infile 2:", infile_ambiguous_muts) + +#=========== +# output +#=========== +# clean gene_metadat with ambiguous muts resolved +gene_metadata_clean = paste0(tolower(gene), "_metadata.csv") +outfile_gene_metadata_clean = paste0(outdir, "/", gene_metadata_clean) +cat("Output file:", outfile_gene_metadata_clean) + +#%%=============================================================== + +########################### +# Read file: _meta data raw.csv +########################### +cat("Reading meta data file:", infile_gene_metadata_raw) + +gene_metadata_raw <- read.csv(infile_gene_metadata_raw + , stringsAsFactors = F + , header = T) + +cat("Dim:", dim(gene_metadata_raw)) + +########################### +# Read file: ambiguous muts.csv +########################## +ambiguous_muts = read.csv(infile_ambiguous_muts) + +ambiguous_muts_names = ambiguous_muts$mutation + +common_muts_all = gene_metadata_raw[gene_metadata_raw$mutation%in%ambiguous_muts_names,] + +#============== +# resolving ambiguous muts +#=============== +table(gene_metadata_raw$mutation_info) +count_check = as.data.frame(cbind(table(gene_metadata_raw$mutationinformation, gene_metadata_raw$mutation_info))) +count_check$checks = ifelse(count_check[[dr_muts_col]]&count_check[[other_muts_col]]>0 + , "ambi", "pass") +table(count_check$checks) + +cat(table(count_check$checks)[["ambi"]], "ambiguous muts detected. 
Correcting these") + +# remove all ambiguous muts from df +ids = gene_metadata_raw$mutation%in%common_muts_all$mutation; table(ids) +gene_metadata_raw_unambiguous = gene_metadata_raw[!ids,] + +# sanity checks: should be true +table(gene_metadata_raw_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_raw_unambiguous) +nrow(gene_metadata_raw_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata_raw) + +# check before resolving ambiguous muts +table(common_muts_all$mutation_info) +common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info) + +# resolving ambiguous muts: change other_muts to dr_muts +common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col + +table(common_muts_all$mutation_info) +common_muts_all$mutation_info = factor(common_muts_all$mutation_info) +table(common_muts_all$mutation_info) +common_muts_all$mutation_info = as.character(common_muts_all$mutation_info) + +# combining to a clean df +gene_metadata = rbind(gene_metadata_raw_unambiguous, common_muts_all) +# sanity check: should be TRUE, i.e. clean df has the same dimensions as the raw df +all(dim(gene_metadata) == dim(gene_metadata_raw)) +table(gene_metadata$mutation_info) + +count_check2 = as.data.frame(cbind(table(gene_metadata$mutationinformation + , gene_metadata$mutation_info))) + +count_check2$checks = ifelse(count_check2[[dr_muts_col]]&count_check2[[other_muts_col]]>0 + , "ambi", "pass") + +if (all(count_check2$checks == "pass")){ + cat("PASS: ambiguous muts successfully resolved." + , "\nwriting file") +}else{ + print("FAIL: ambiguous muts could not be resolved!") + quit() +} +#============ # writing file #============= +write.csv(gene_metadata, outfile_gene_metadata_clean + , row.names = FALSE) + +#===================================================================== # End of script #====================================================================== diff --git a/scripts/plotting/running_plot_scripts b/scripts/plotting/running_plot_scripts index 4abdd9a..6749fa1 100644 --- a/scripts/plotting/running_plot_scripts +++ b/scripts/plotting/running_plot_scripts @@ -1,12 +1,26 @@ #======== # plotting #======== +FIXME: + +#======================= +dirs.R +#======================= +#dirs.R: imports dir structure +change this to cmd so you can pass in drug and gene name #======================= plotting_data.R #======================= ??? update how to run + +#======================= +combining_dfs_plotting.R +#======================= +??? 
update how to run + + #======================= mcsm_mean_stability.R #======================= @@ -58,6 +72,13 @@ barplots_subcolours_PS.R # output plots: 1 svg 1) barplot_coloured_PS.svg +#======================= +lineage_dist_PS.R +# lineage distribution for all muts and dr muts in one figure +#======================= +# input: calls the "combining_dfs_plotting.R" +# output plots: 1 svg + 1) lineage_dist_combined_PS.svg ######################################################################## diff --git a/scripts/plotting/test.R b/scripts/plotting/test.R new file mode 100644 index 0000000..8033517 --- /dev/null +++ b/scripts/plotting/test.R @@ -0,0 +1,96 @@ +setwd("/home/tanu/git/LSHTM_analysis/scripts/plotting") + +source("combining_dfs_plotting.R") + +table(merged_df3$mutation_info) + +# assign foldx +#ddg<0 = "Stabilising" (-ve) +table(merged_df3$ddg < 0) +merged_df3$foldx_outcome = ifelse(merged_df3$ddg < 0, "Stabilising", "Destabilising") + +#=========== +# PS data +#=========== +dr_muts = merged_df3[merged_df3$mutation_info == "dr_mutations_pyrazinamide",] +other_muts = merged_df3[merged_df3$mutation_info == "other_mutations_pyrazinamide",] + +par(mfrow = c(1,1)) +par(mfrow = c(2,6)) + +# mcsm duet +boxplot(dr_muts$duet_scaled, other_muts$duet_scaled, main = "DUET" + #, col = factor(merged_df3$duet_outcome) + ) +wilcox.test(dr_muts$duet_scaled, other_muts$duet_scaled, paired = F) + +# foldx ddg +boxplot(dr_muts$ddg, other_muts$ddg, main = "Foldx") +wilcox.test(dr_muts$ddg, other_muts$ddg, paired = F) + +# rd +boxplot(dr_muts$rd_values, other_muts$rd_values, main = "RD") +wilcox.test(dr_muts$rd_values, other_muts$rd_values) + +# kd +boxplot(dr_muts$kd_values, other_muts$kd_values, main = "KD") +wilcox.test(dr_muts$kd_values, other_muts$kd_values) + +# asa +boxplot(dr_muts$asa, other_muts$asa, main = "ASA") +wilcox.test(dr_muts$asa, other_muts$asa) + +# rsa +boxplot(dr_muts$rsa, other_muts$rsa, main = "RSA") +wilcox.test(dr_muts$rsa, other_muts$rsa) + +#=================================================================== +#========== +# LIG data +#========== +dr_muts_lig = merged_df3_lig[merged_df3_lig$mutation_info == "dr_mutations_pyrazinamide",] +other_muts_lig = merged_df3_lig[merged_df3_lig$mutation_info == "other_mutations_pyrazinamide",] + +# mcsm ligand affinity +boxplot(dr_muts_lig$duet_scaled, other_muts_lig$duet_scaled, main = "Ligand affinity") +wilcox.test(dr_muts_lig$duet_scaled, other_muts_lig$duet_scaled, paired = F) + +# rd +boxplot(dr_muts_lig$rd_values, other_muts_lig$rd_values, main = "RD") +wilcox.test(dr_muts_lig$rd_values, other_muts_lig$rd_values) + +# kd +boxplot(dr_muts_lig$kd_values, other_muts_lig$kd_values, main = "KD") +wilcox.test(dr_muts_lig$kd_values, other_muts_lig$kd_values) + +# asa +boxplot(dr_muts_lig$asa, other_muts_lig$asa, main = "ASA") +wilcox.test(dr_muts_lig$asa, other_muts_lig$asa) + +# rsa +boxplot(dr_muts_lig$rsa, other_muts_lig$rsa, main = "RSA") +wilcox.test(dr_muts_lig$rsa, other_muts_lig$rsa) + +# checking agreement b/w mcsm and foldx +cols_to_select = c("mutationinformation" + , "mutation_info" + , "duet_scaled" + , "ddg" + , "duet_outcome" + , "foldx_outcome") + +merged_df3_short = select(merged_df3, cols_to_select) + +mcsm_foldx = merged_df3_short[which(merged_df3_short$duet_outcome != merged_df3_short$foldx_outcome),] + +mcsm_foldx$sign_comp = ifelse(sign(mcsm_foldx$duet_scaled)==sign(mcsm_foldx$ddg), "PASS", "FAIL") +table(mcsm_foldx$sign_comp) + +# another way of checking +merged_df3$sign_comp = 
ifelse(sign(merged_df3$duet_scaled)==sign(merged_df3$ddg), "PASS", "FAIL") +table(merged_df3$sign_comp) + +# count the "FAIL" (sign mismatch) rows explicitly rather than by table position +disagreement = sum(merged_df3$sign_comp == "FAIL")/nrow(merged_df3)*100 +agreement = 100 - disagreement + +cat("There is", agreement, "% agreement between mcsm and foldx predictions")
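
For reference, the sign-comparison above can be folded into a small helper; this is a minimal sketch, assuming merged_df3 carries numeric duet_scaled and ddg columns as in test.R, and the helper name sign_agreement() is illustrative rather than part of the repo:

sign_agreement <- function(df, col1 = "duet_scaled", col2 = "ddg"){
  ok = complete.cases(df[[col1]], df[[col2]])           # drop rows with NA in either column
  agree = sign(df[[col1]][ok]) == sign(df[[col2]][ok])  # TRUE where both predictions point the same way
  round(100 * mean(agree), 2)                           # % agreement
}

# example usage:
# sign_agreement(merged_df3)                                 # overall % agreement
# table(merged_df3$duet_outcome, merged_df3$foldx_outcome)   # 2x2 cross-tab of outcomes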