Merge branch 'master' of github.com:tgttunstall/LSHTM_analysis

Tanushree Tunstall 2020-09-10 16:14:46 +01:00
commit 5102bbea1b
21 changed files with 2132 additions and 243 deletions

1
.gitignore vendored

@ -13,3 +13,4 @@ scratch
test
plotting_test
scripts_old
foldx/test/


@ -54,6 +54,8 @@ os.getcwd()
# FIXME: local imports
#from combining import combine_dfs_with_checks
from combining_FIXME import detect_common_cols
from reference_dict import oneletter_aa_dict # CHECK DIR STRUC THERE!
from reference_dict import low_3letter_dict # CHECK DIR STRUC THERE!
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
@ -71,13 +73,27 @@ args = arg_parser.parse_args()
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
gene_match = gene + '_p.'
print('mut pattern for gene', gene, ':', gene_match)
nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
print('nsSNP for gene', gene, ':', nssnp_match)
wt_regex = gene_match.lower()+'([A-Za-z]{3})'
print('wt regex:', wt_regex)
mut_regex = r'[0-9]+(\w{3})$'
print('mt regex:', mut_regex)
pos_regex = r'([0-9]+)'
print('position regex:', pos_regex)
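# --- Illustrative sketch (not part of this commit) --------------------------
# How the patterns above behave on a typical mutation string, assuming entries
# look like 'pnca_p.trp68gly' (<gene>_p.<wt 3-letter><position><mut 3-letter>):
import re
example_mut = 'pnca_p.trp68gly'              # assumed format
re.search(wt_regex, example_mut).group(1)    # 'trp' -> wild-type residue
re.search(pos_regex, example_mut).group(1)   # '68'  -> first number found
re.search(mut_regex, example_mut).group(1)   # 'gly' -> 3 letters before string end
# -----------------------------------------------------------------------------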
#%%======================================================================= #%%=======================================================================
#============== #==============
# directories # directories
@ -155,6 +171,8 @@ ncols_m1 = len(mcsm_foldx_dfs.columns)
print('\n\nResult of first merge:', mcsm_foldx_dfs.shape
, '\n===================================================================')
mcsm_foldx_dfs[merging_cols_m1].apply(len)
mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs)
#%%============================================================================
print('==================================='
, '\nSecond merge: dssp + kd'
@ -183,6 +201,8 @@ ncols_m3 = len(dssp_kd_rd_dfs.columns)
print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape
, '\n===================================================================')
dssp_kd_rd_dfs[merging_cols_m3].apply(len)
dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
#%%============================================================================
print('======================================='
, '\nFourth merge: First merge + Third merge'
@ -203,12 +223,16 @@ else:
print('\nResult of Fourth merge:', combined_df.shape
, '\n===================================================================')
combined_df[merging_cols_m4].apply(len)
combined_df[merging_cols_m4].apply(len) == len(combined_df)
#%%============================================================================
# OR merges: TEDIOUSSSS!!!!
del(mcsm_df, foldx_df, mcsm_foldx_dfs, dssp_kd_dfs, dssp_kd_rd_dfs,rd_df, kd_df, infile_mcsm, infile_foldx, infile_dssp, infile_kd)
del(merging_cols_m1, merging_cols_m2, merging_cols_m3, merging_cols_m4)
del(in_filename_dssp, in_filename_foldx, in_filename_kd, in_filename_mcsm, in_filename_rd)
#%%
print('==================================='
, '\nFifth merge: afor_df + afor_kin_df'
, '\n===================================')
@ -220,8 +244,6 @@ afor_df = pd.read_csv(infile_afor, sep = ',')
afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',')
afor_kin_df.columns = afor_kin_df.columns.str.lower()
merging_cols_m5 = detect_common_cols(afor_df, afor_kin_df)
print('Dim of afor_df:', afor_df.shape
@ -230,7 +252,7 @@ print('Dim of afor_df:', afor_df.shape
# finding if ALL afor_kin_df muts are present in afor_df
# i.e all kinship muts should be PRESENT in mycalcs_present
if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]:
print('PASS: ALL', len(afor_kin_df), 'or_kinship muts are present in my or list')
else:
nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])])
nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]
@ -244,7 +266,7 @@ common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])])
extra_muts_myor = afor_kin_df.shape[0] - common_muts
print('=========================================='
, '\nmy or calcs has', common_muts, 'present in af_or_kin_df'
, '\n==========================================')
print('Expected cals for merging with outer_join...')
@ -252,23 +274,29 @@ print('Expected cals for merging with outer_join...')
expected_rows = afor_df.shape[0] + extra_muts_myor
expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5)
afor_df['mutation']
afor_kin_df['mutation']
ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join)
if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols:
print('PASS but with duplicate muts: OR dfs successfully combined! PHEWWWW!'
, '\nDuplicate muts present but with different \'ref\' and \'alt\' alleles')
else:
print('FAIL: could not combine OR dfs'
, '\nCheck expected rows and cols calculation and join type')
print('Dim of merged ors_df:', ors_df.shape)
ors_df[merging_cols_m5].apply(len)
ors_df[merging_cols_m5].apply(len) == len(ors_df)
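# Illustrative sketch (not part of this commit): the expected-dimension arithmetic
# used above for an outer join on a shared key, assuming unique keys per frame.
left_toy = pd.DataFrame({'mutation': ['m1', 'm2', 'm3'], 'a': [1, 2, 3]})
right_toy = pd.DataFrame({'mutation': ['m2', 'm3', 'm4'], 'b': [9, 8, 7]})
extra_toy = (~right_toy['mutation'].isin(left_toy['mutation'])).sum()   # 1 row only in right
exp_rows_toy = len(left_toy) + extra_toy                                # 3 + 1 = 4
exp_cols_toy = left_toy.shape[1] + right_toy.shape[1] - 1               # shared key counted once = 3
merged_toy = pd.merge(left_toy, right_toy, on = 'mutation', how = 'outer')
assert merged_toy.shape == (exp_rows_toy, exp_cols_toy)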
#%%============================================================================
# formatting ors_df
ors_df.columns
# Dropping unnecessary columns: already removed in earlier preprocessing
#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
cols_to_drop = ['n_miss']
print('Dropping', len(cols_to_drop), 'columns:\n'
, cols_to_drop)
@ -283,18 +311,13 @@ column_order = ['mutation'
, 'wild_type'
, 'position'
, 'mutant_type'
, 'ref_allele'
, 'alt_allele'
, 'mut_info_f1'
, 'mut_info_f2'
, 'mut_type'
, 'gene_id'
, 'gene_name'
, 'chromosome_number'
, 'af'
, 'af_kin'
@ -317,14 +340,9 @@ column_order = ['mutation'
, 'se_kin'
, 'zval_logistic'
, 'logl_h1_kin'
, 'l_remle_kin']
if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())):
print('PASS: Column order generated for all:', len(column_order), 'columns'
, '\nColumn names match, safe to reorder columns'
, '\nApplying column order to df...' )
@ -337,6 +355,61 @@ else:
print('\nResult of Sixth merge:', ors_df_ordered.shape
, '\n===================================================================')
#%%
ors_df_ordered.shape
check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']]
# populating 'nan' info
lookup_dict = dict()
for k, v in low_3letter_dict.items():
lookup_dict[k] = v['one_letter_code']
#print(lookup_dict)
wt = ors_df_ordered['mutation'].str.extract(wt_regex).squeeze()
#print(wt)
ors_df_ordered['wild_type'] = wt.map(lookup_dict)
ors_df_ordered['position'] = ors_df_ordered['mutation'].str.extract(pos_regex)
mt = ors_df_ordered['mutation'].str.extract(mut_regex).squeeze()
ors_df_ordered['mutant_type'] = mt.map(lookup_dict)
ors_df_ordered['mutationinformation'] = ors_df_ordered['wild_type'] + ors_df_ordered.position.map(str) + ors_df_ordered['mutant_type']
check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']]
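# Illustrative sketch (not part of this commit): the block above turns a long-form
# mutation such as 'pnca_p.trp68gly' (format assumed) into the short form 'W68G',
# assuming low_3letter_dict keys are lowercase three-letter residue codes.
toy_muts = pd.Series(['pnca_p.trp68gly', 'pnca_p.ala134val'])
toy_wt = toy_muts.str.extract(wt_regex).squeeze().map(lookup_dict)    # 'W', 'A'
toy_pos = toy_muts.str.extract(pos_regex).squeeze()                   # '68', '134'
toy_mt = toy_muts.str.extract(mut_regex).squeeze().map(lookup_dict)   # 'G', 'V'
toy_wt + toy_pos + toy_mt                                             # 'W68G', 'A134V'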
# populate mut_info_f1
ors_df_ordered['mut_info_f1'].isna().sum()
ors_df_ordered['mut_info_f1'] = ors_df_ordered['position'].astype(str) + ors_df_ordered['wild_type'] + '>' + ors_df_ordered['position'].astype(str) + ors_df_ordered['mutant_type']
ors_df_ordered['mut_info_f1'].isna().sum()
# populate mut_info_f2
ors_df_ordered['mut_info_f2'] = ors_df_ordered['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
# populate mut_type
ors_df_ordered['mut_type'].isna().sum()
#mut_type_word = ors_df_ordered['mut_type'].value_counts()
mut_type_word = 'missense' # FIXME, should be derived
ors_df_ordered['mut_type'].fillna(mut_type_word, inplace = True)
ors_df_ordered['mut_type'].isna().sum()
# populate gene_id
ors_df_ordered['gene_id'].isna().sum()
#gene_id_word = ors_df_ordered['gene_id'].value_counts()
gene_id_word = 'Rv2043c' # FIXME, should be derived
ors_df_ordered['gene_id'].fillna(gene_id_word, inplace = True)
ors_df_ordered['gene_id'].isna().sum()
# populate gene_name
ors_df_ordered['gene_name'].isna().sum()
ors_df_ordered['gene_name'].value_counts()
ors_df_ordered['gene_name'].fillna(gene, inplace = True)
ors_df_ordered['gene_name'].isna().sum()
# check numbers
ors_df_ordered['or_kin'].isna().sum()
# should be 0
ors_df_ordered['or_mychisq'].isna().sum()
#%%============================================================================
print('==================================='
, '\nSixth merge: Fourth + Fifth merge'
@ -345,52 +418,158 @@ print('==================================='
#combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join)
merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered)
# dtype problems
if len(merging_cols_m6) > 1 and 'position' in merging_cols_m6:
print('Removing \'position\' from merging_cols_m6 to make dtypes consistent'
, '\norig length of merging_cols_m6:', len(merging_cols_m6))
merging_cols_m6.remove('position')
print('\nlength after removing:', len(merging_cols_m6))
print('Dim of df1:', combined_df.shape
, '\nDim of df2:', ors_df_ordered.shape
, '\nNo. of merging_cols:', len(merging_cols_m6))
print('Checking mutations in the two dfs:'
, '\nmuts in df1 present in df2:'
, combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum()
, '\nmuts in df2 present in df1:'
, ors_df_ordered['mutationinformation'].isin(combined_df['mutationinformation']).sum())
#----------
# merge 6
#----------
combined_df_all = pd.merge(combined_df, ors_df_ordered, on = merging_cols_m6, how = o_join)
combined_df_all.shape
# sanity check for merge 6
outdf_expected_rows = len(combined_df) + extra_muts_myor
unique_muts = len(combined_df)
outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6)
if combined_df_all.shape[0] == outdf_expected_rows and combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == unique_muts:
print('PASS: Df dimension match'
, '\ncombined_df_all with join type:', o_join
, '\n', combined_df_all.shape
, '\n===============================================================')
else:
print('FAIL: Df dimension mismatch'
, 'Cannot generate expected dim. See details of merge performed'
, '\ndf1 dim:', combined_df.shape
, '\ndf2 dim:', ors_df_ordered.shape
, '\nGot:', combined_df_all.shape
, '\nmuts in df1 but NOT in df2:'
, combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum()
, '\nmuts in df2 but NOT in df1:'
, ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum())
sys.exit()
# drop extra cols
all_cols = combined_df_all.columns
#pos_cols_check = combined_df_all[['position_x','position_y']]
c = combined_df_all[['position_x','position_y']].isna().sum()
pos_col_to_drop = c.index[c>0].to_list()
cols_to_drop = pos_col_to_drop + ['wild_type_kd']
print('Dropping', len(cols_to_drop), 'columns:\n', cols_to_drop)
combined_df_all.drop(cols_to_drop, axis = 1, inplace = True)
# rename position_x to position
pos_col_to_rename = c.index[c==0].to_list()
combined_df_all.shape
combined_df_all.rename(columns = { pos_col_to_rename[0]: 'position'}, inplace = True)
combined_df_all.shape
all_cols = combined_df_all.columns
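# Illustrative sketch (not part of this commit): pandas suffixes clashing non-key
# columns, which is why 'position_x'/'position_y' appear after the merge above and
# one of them has to be dropped and the other renamed back to 'position'.
toy_a = pd.DataFrame({'mutationinformation': ['W68G'], 'position': [68]})
toy_b = pd.DataFrame({'mutationinformation': ['W68G'], 'position': ['68'], 'af': [0.1]})
toy_m = pd.merge(toy_a, toy_b, on = 'mutationinformation', how = 'outer')
toy_m.columns.tolist()   # ['mutationinformation', 'position_x', 'position_y', 'af']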
#%% reorder cols to for convenience
first_cols = ['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']
last_cols = [col for col in combined_df_all.columns if col not in first_cols]
combined_df_all = combined_df_all[first_cols+last_cols]
#%% IMPORTANT: check if mutation related info is all populated after this merge
# select string colnames to ensure no NA exist there
string_cols = combined_df_all.columns[combined_df_all.applymap(lambda x: isinstance(x, str)).all(0)]
if (combined_df_all[string_cols].isna().sum(axis = 0)== 0).all():
print('PASS: All string cols are populated with no NAs')
else:
print('FAIL: NAs detected in string cols')
print(combined_df_all[string_cols].isna().sum(axis = 0))
sys.exit()
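# Illustrative sketch (not part of this commit) of the string-column selection above:
# applymap tests every cell and .all(0) keeps columns where every value is a str.
toy_df = pd.DataFrame({'mut': ['W68G', 'A134V'], 'or_val': [1.2, None]})
toy_df.columns[toy_df.applymap(lambda x: isinstance(x, str)).all(0)].tolist()   # ['mut']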
# relevant mut cols
check_mut_cols = merging_cols_m5 + merging_cols_m6
count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'})
print(check_mut_cols)
c2 = combined_df_all[check_mut_cols].isna().sum()
missing_info_cols = c2.index[c2>0].to_list()
if c2.sum()>0:
#na_muts_n = combined_df_all['mutation'].isna().sum()
na_muts_n = combined_df_all[missing_info_cols].isna().sum()
print(na_muts_n.values[0], 'mutations have missing \'mutation\' info.'
, '\nFetching these from reference dict...')
else:
print('No missing \'mutation\' has been detected!')
lookup_dict = dict()
for k, v in oneletter_aa_dict.items():
lookup_dict[k] = v['three_letter_code_lower']
print(lookup_dict)
wt_3let = combined_df_all['wild_type'].map(lookup_dict)
#print(wt_3let)
pos = combined_df_all['position'].astype(str)
#print(pos)
mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
#print(mt_3let)
# override the 'mutation' column
combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
print(combined_df_all['mutation'])
# check again
if combined_df_all[missing_info_cols].isna().sum().all() == 0:
print('PASS: No mutations have missing \'mutation\' info.')
else:
print('FAIL:', combined_df_all[missing_info_cols].isna().sum().values[0]
, '\nmutations have missing info STILL...')
sys.exit()
#%% check
foo = combined_df_all.drop_duplicates('mutationinformation')
foo2 = combined_df_all.drop_duplicates('mutation')
if foo.equals(foo2):
print('PASS: Dropping mutation or mutationinformation has the same effect\n')
else:
print('FAIL: Still problems in merged data')
sys.exit()
#%%============================================================================
output_cols = combined_df_all.columns
print('Output cols:', output_cols)
#%% IMPORTANT result info
if combined_df_all['or_mychisq'].isna().sum() == len(combined_df) - len(afor_df):
print('PASS: No. of NA in or_mychisq matches expected length'
, '\nNo. of with NA in or_mychisq:', combined_df_all['or_mychisq'].isna().sum()
, '\nNo. of NA in or_kin:', combined_df_all['or_kin'].isna().sum())
else:
print('FAIL: No. of NA in or_mychisq does not match expected length')
if combined_df_all.shape[0] == outdf_expected_rows:
print('\nINFORMATIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele'
, '\n=============================================================')
else:
print('combined_df_all has no duplicate muts present'
,'\n===============================================================')
print('\nDim of combined_data:', combined_df_all.shape
, '\nNo. of unique mutations:', combined_df_all['mutationinformation'].nunique())
#%%============================================================================
# write csv


@ -45,8 +45,6 @@ Created on Tue Aug 6 12:56:03 2019
#5. chain
#6. wild_pos
#7. wild_chain_pos
#=======================================================================
#%% load libraries
import os, sys
@ -83,16 +81,16 @@ gene = args.gene
gene_match = gene + '_p.'
print('mut pattern for gene', gene, ':', gene_match)
nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
print('nsSNP for gene', gene, ':', nssnp_match)
wt_regex = gene_match.lower()+'([A-Za-z]{3})'
print('wt regex:', wt_regex)
mut_regex = r'[0-9]+(\w{3})$'
print('mt regex:', mut_regex)
pos_regex = r'([0-9]+)'
print('position regex:', pos_regex)
# building cols to extract
@ -156,30 +154,29 @@ if in_filename_master == 'original_tanushree_data_v2.csv':
else:
core_cols = ['id'
, 'sample'
#, 'patient_id'
#, 'strain'
, 'lineage'
, 'sublineage'
#, 'country'
, 'country_code'
, 'geographic_source'
#, 'region'
#, 'location'
#, 'host_body_site'
#, 'environment_material'
#, 'host_status'
#, 'host_sex'
#, 'submitted_host_sex'
#, 'hiv_status'
#, 'HIV_status'
#, 'tissue_type'
#, 'isolation_source'
, resistance_col]
variable_based_cols = [drug
, dr_muts_col
, other_muts_col]
cols_to_extract = core_cols + variable_based_cols
print('Extracting', len(cols_to_extract), 'columns from master data')
@ -202,7 +199,7 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum()
#%% Write check file
check_file = outdir + '/' + gene.lower() + '_gwas.csv'
meta_data.to_csv(check_file, index = False)
print('Writing subsetted gwas data'
, '\nFile', check_file
, '\nDim:', meta_data.shape)
@ -217,9 +214,9 @@ print('Writing subsetted gwas data'
# drug counts: complete samples for OR calcs
meta_data[drug].value_counts()
print('===========================================================\n'
, 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
, '\n===========================================================\n'
, 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
, '\n===========================================================')
#%%
@ -1173,7 +1170,7 @@ del(out_filename_metadata_poscounts)
#%% Write file: gene_metadata (i.e gene_LF1)
# where each row has UNIQUE mutations NOT unique sample ids
out_filename_metadata = gene.lower() + '_metadata_raw.csv'
outfile_metadata = outdir + '/' + out_filename_metadata
print('Writing file: LF formatted data'
, '\nFile:', outfile_metadata

211
scripts/ks_test_PS.R Normal file

@ -0,0 +1,211 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: KS test for PS/DUET lineage distributions
#=======================================================================
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/")
getwd()
#source("/plotting/Header_TT.R")
#source("../barplot_colour_function.R")
#require(data.table)
source("plotting/combining_dfs_plotting.R")
# should return the following dfs, directories and variables
# PS combined:
# 1) merged_df2
# 2) merged_df2_comp
# 3) merged_df3
# 4) merged_df3_comp
# LIG combined:
# 5) merged_df2_lig
# 6) merged_df2_comp_lig
# 7) merged_df3_lig
# 8) merged_df3_comp_lig
# 9) my_df_u
# 10) my_df_u_lig
cat(paste0("Directories imported:"
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir))
cat(paste0("Variables imported:"
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nAngstrom symbol:", angstroms_symbol
, "\nNo. of duplicated muts:", dup_muts_nu
, "\nNA count for ORs:", na_count
, "\nNA count in df2:", na_count_df2
, "\nNA count in df3:", na_count_df3))
###########################
# Data for stats
# you need merged_df2 or merged_df2_comp
# since this is one-many relationship
# i.e the same SNP can belong to multiple lineages
# using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available, hence use df with NA
###########################
# REASSIGNMENT
my_df = merged_df2
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info); str(my_df$mutation_info)
# subset df with dr muts only
my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide")
table(my_df_dr$mutation_info)
# stats for all muts and dr_muts
# 1) for all muts
# 2) for dr_muts
#===========================
table(my_df$lineage); str(my_df$lineage)
table(my_df_dr$lineage); str(my_df_dr$lineage)
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4")
# subset and refactor: all muts
df_lin = subset(my_df, subset = lineage %in% sel_lineages)
df_lin$lineage = factor(df_lin$lineage)
# subset and refactor: dr muts
df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
df_lin_dr$lineage = factor(df_lin_dr$lineage)
#{RESULT: No of samples within lineage}
table(df_lin$lineage)
table(df_lin_dr$lineage)
#{Result: No. of unique mutations the 4 lineages contribute to}
length(unique(df_lin$mutationinformation))
length(unique(df_lin_dr$mutationinformation))
# COMPARING DISTRIBUTIONS
#================
# ALL mutations
#=================
head(df_lin$lineage)
df_lin$lineage = as.character(df_lin$lineage)
lin1 = df_lin[df_lin$lineage == "lineage1",]$duet_scaled
lin2 = df_lin[df_lin$lineage == "lineage2",]$duet_scaled
lin3 = df_lin[df_lin$lineage == "lineage3",]$duet_scaled
lin4 = df_lin[df_lin$lineage == "lineage4",]$duet_scaled
# ks test
lin12 = ks.test(lin1,lin2)
lin12_df = as.data.frame(cbind(lin12$data.name, lin12$p.value))
lin13 = ks.test(lin1,lin3)
lin13_df = as.data.frame(cbind(lin13$data.name, lin13$p.value))
lin14 = ks.test(lin1,lin4)
lin14_df = as.data.frame(cbind(lin14$data.name, lin14$p.value))
lin23 = ks.test(lin2,lin3)
lin23_df = as.data.frame(cbind(lin23$data.name, lin23$p.value))
lin24 = ks.test(lin2,lin4)
lin24_df = as.data.frame(cbind(lin24$data.name, lin24$p.value))
lin34 = ks.test(lin3,lin4)
lin34_df = as.data.frame(cbind(lin34$data.name, lin34$p.value))
ks_results_all = rbind(lin12_df
, lin13_df
, lin14_df
, lin23_df
, lin24_df
, lin34_df)
#p-value < 2.2e-16
rm(lin12, lin12_df
, lin13, lin13_df
, lin14, lin14_df
, lin23, lin23_df
, lin24, lin24_df
, lin34, lin34_df)
#================
# DRUG mutations
#=================
head(df_lin_dr$lineage)
df_lin_dr$lineage = as.character(df_lin_dr$lineage)
lin1_dr = df_lin_dr[df_lin_dr$lineage == "lineage1",]$duet_scaled
lin2_dr = df_lin_dr[df_lin_dr$lineage == "lineage2",]$duet_scaled
lin3_dr = df_lin_dr[df_lin_dr$lineage == "lineage3",]$duet_scaled
lin4_dr = df_lin_dr[df_lin_dr$lineage == "lineage4",]$duet_scaled
# ks test: dr muts
lin12_dr = ks.test(lin1_dr,lin2_dr)
lin12_df_dr = as.data.frame(cbind(lin12_dr$data.name, lin12_dr$p.value))
lin13_dr = ks.test(lin1_dr,lin3_dr)
lin13_df_dr = as.data.frame(cbind(lin13_dr$data.name, lin13_dr$p.value))
lin14_dr = ks.test(lin1_dr,lin4_dr)
lin14_df_dr = as.data.frame(cbind(lin14_dr$data.name, lin14_dr$p.value))
lin23_dr = ks.test(lin2_dr,lin3_dr)
lin23_df_dr = as.data.frame(cbind(lin23_dr$data.name, lin23_dr$p.value))
lin24_dr = ks.test(lin2_dr,lin4_dr)
lin24_df_dr = as.data.frame(cbind(lin24_dr$data.name, lin24_dr$p.value))
lin34_dr = ks.test(lin3_dr,lin4_dr)
lin34_df_dr = as.data.frame(cbind(lin34_dr$data.name, lin34_dr$p.value))
ks_results_dr = rbind(lin12_df_dr
, lin13_df_dr
, lin14_df_dr
, lin23_df_dr
, lin24_df_dr
, lin34_df_dr)
ks_results_combined = cbind(ks_results_all, ks_results_dr)
my_colnames = c("Lineage_comparisons"
, paste0("All_mutations n=", nrow(df_lin))
, paste0("Drug_associated_mutations n=", nrow(df_lin_dr)))
my_colnames
# select the output columns
ks_results_combined_f = ks_results_combined[,c(1,2,4)]
colnames(ks_results_combined_f) = my_colnames
ks_results_combined_f
#=============
# write output file
#=============
ks_results = paste0(outdir,"/results/ks_results.csv")
write.csv(ks_results_combined_f, ks_results, row.names = F)
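For readers who prefer Python, the pairwise lineage comparison above can be sketched with scipy.stats.ks_2samp (illustrative only, not part of this commit; the 'lineage' and 'duet_scaled' column names come from the script above, while the input file path is hypothetical):

import itertools
import pandas as pd
from scipy.stats import ks_2samp

df = pd.read_csv('merged_df2.csv')  # hypothetical CSV export of merged_df2
lineages = ['lineage1', 'lineage2', 'lineage3', 'lineage4']
rows = []
for a, b in itertools.combinations(lineages, 2):
    x = df.loc[df['lineage'] == a, 'duet_scaled'].dropna()
    y = df.loc[df['lineage'] == b, 'duet_scaled'].dropna()
    stat, pval = ks_2samp(x, y)
    rows.append({'comparison': a + ' vs ' + b, 'ks_stat': stat, 'p_value': pval})
print(pd.DataFrame(rows))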


@ -104,7 +104,7 @@ or_df.columns
#%% snp_info file: master and gene specific ones
# gene info
info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #447, 11
#info_df2 = pd.read_csv(gene_info, sep = ',', header = 0) #447 10
mis_mut_cover = (info_df2['chromosome_number'].nunique()/info_df2['chromosome_number'].count()) * 100
print('*****RESULT*****'
@ -212,7 +212,7 @@ else:
#PENDING Jody's reply
# !!!!!!!!
# drop nan from dfm2_mis as these are not useful and JP confirmed the same
print('Dropping NAs before further processing...')
dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
# !!!!!!!!


@ -1,45 +0,0 @@
#!/usr/bin/env python3
#=======================================================================
#%% useful links
#https://towardsdatascience.com/autoviz-automatically-visualize-any-dataset-ba2691a8b55a
#https://pypi.org/project/autoviz/
#=======================================================================
import os, sys
import pandas as pd
import numpy as np
import re
import argparse
from autoviz.AutoViz_Class import AutoViz_Class
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#%%============================================================================
# variables
gene = 'pncA'
drug = 'pyrazinamide'
#%%============================================================================
#==============
# directories
#==============
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'
#=======
# input
#=======
in_filename_plotting = 'car_design.csv'
in_filename_plotting = gene.lower() + '_all_params.csv'
infile_plotting = outdir + '/' + in_filename_plotting
print('plotting file: ', infile_plotting
, '\n============================================================')
#=======================================================================
plotting_df = pd.read_csv(infile_plotting, sep = ',')
#Instantiate the AutoViz class
AV = AutoViz_Class()
df = AV.AutoViz(infile_plotting)
#df2 = AV.AutoViz(plotting_df)
plotting_df.columns[~plotting_df.columns.isin(df.columns)]


@ -1,5 +1,4 @@
#!/usr/bin/env Rscript
setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd()
@ -19,8 +18,7 @@ getwd()
library(ggplot2)
library(data.table)
source("barplot_colour_function.R")
source("subcols_axis.R")
# should return the following dfs, directories and variables
# mut_pos_cols
@ -161,9 +159,7 @@ min(df$duet_scaled)
max(df$duet_scaled)
# sanity checks
tapply(df$duet_scaled, df$duet_outcome, min)
tapply(df$duet_scaled, df$duet_outcome, max)
# My colour FUNCTION: based on group and subgroup
@ -241,7 +237,7 @@ outPlot = g +
, axis.ticks.x = element_blank()) +
labs(title = ""
#title = my_title
, x = "Position"
, y = "Frequency")
print(outPlot)


@ -1,87 +0,0 @@
,x,,changes,
1,mutationinformation,,Mutationinformation,
2,wild_type,,,consider...wild_aa
3,position,,Position,
4,mutant_type,,,consider...mutant_aa
5,chain,,,
6,ligand_id,,,
7,ligand_distance,,,
8,duet_stability_change,,,
9,duet_outcome,,DUET_outcome,
10,ligand_affinity_change,,,
11,ligand_outcome,,Lig_outcome,
12,duet_scaled,,ratioDUET,
13,affinity_scaled,,ratioPredAff,
14,wild_pos,,WildPos,
15,wild_chain_pos,,,
16,ddg,,,
17,contacts,,,
18,electro_rr,,,
19,electro_mm,,,
20,electro_sm,,,
21,electro_ss,,,
22,disulfide_rr,,,
23,disulfide_mm,,,
24,disulfide_sm,,,
25,disulfide_ss,,,
26,hbonds_rr,,,
27,hbonds_mm,,,
28,hbonds_sm,,,
29,hbonds_ss,,,
30,partcov_rr,,,
31,partcov_mm,,,
32,partcov_sm,,,
33,partcov_ss,,,
34,vdwclashes_rr,,,
35,vdwclashes_mm,,,
36,vdwclashes_sm,,,
37,vdwclashes_ss,,,
38,volumetric_rr,,,
39,volumetric_mm,,,
40,volumetric_sm,,,
41,volumetric_ss,,,
42,wild_type_dssp,,,
43,asa,,,
44,rsa,,,
45,ss,,,
46,ss_class,,,
47,chain_id,,,
48,wild_type_kd,,,
49,kd_values,,,
50,rd_values,,,
51,wt_3letter_caps,,,
52,mutation,,,
53,af,,,
54,beta_logistic,,,
55,or_logistic,,,
56,pval_logistic,,,
57,se_logistic,,,
58,zval_logistic,,,
59,ci_low_logistic,,,
60,ci_hi_logistic,,,
61,or_mychisq,,,
62,or_fisher,,,
63,pval_fisher,,,
64,ci_low_fisher,,,
65,ci_hi_fisher,,,
66,est_chisq,,,
67,pval_chisq,,,
68,chromosome_number,,,
69,ref_allele,,,
70,alt_allele,,,
71,mut_type,,,
72,gene_id,,,
73,gene_number,,,
74,mut_region,,,
75,mut_info,,,
76,chr_num_allele,,,
77,wt_3let,,,
78,mt_3let,,,
79,af_kin,,,
80,or_kin,,,
81,pwald_kin,,,
82,beta_kin,,,
83,se_kin,,,
84,logl_h1_kin,,,
85,l_remle_kin,,,
86,n_miss,,,


@ -6,6 +6,7 @@
# 2) <gene>_meta_data.csv
# Output:
# 1) muts with opposite effects on stability
# 2) large combined df including NAs for AF, OR,etc
# Dim: same no. of rows as gene associated meta_data_with_AFandOR
# 3) small combined df including NAs for AF, OR, etc.
@ -36,17 +37,23 @@ source("plotting_data.R")
# dup_muts
cat("Directories imported:"
, "\n===================="
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir)
cat("Variables imported:"
, "\n====================="
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nAngstrom symbol:", angstroms_symbol
, "\nNo. of duplicated muts:", dup_muts_nu
, "\ndr_muts_col:", dr_muts_col
, "\nother_muts_col:", other_muts_col
, "\ndrtype_col:", resistance_col)
# clear excess variable
rm(my_df, upos, dup_muts)
@ -57,7 +64,6 @@ rm(my_df, upos, dup_muts)
#in_file1: output of plotting_data.R
# my_df_u
# infile 2: gene associated meta data
#in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv")
in_filename_gene_metadata = paste0(tolower(gene), "_metadata.csv")
@ -94,6 +100,8 @@ gene_metadata <- read.csv(infile_gene_metadata
, header = T)
cat("Dim:", dim(gene_metadata))
table(gene_metadata$mutation_info)
# counting NAs in AF, OR cols
# or_mychisq
@ -124,6 +132,7 @@ if (identical(sum(is.na(my_df_u$or_kin))
, "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
}
str(gene_metadata)
###################################################################
# combining: PS
@ -145,7 +154,7 @@ merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata))
cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)"
, "\nNo. of merging cols:", length(merging_cols)
, "\nMerging columns identified:"))
print(merging_cols)
# important checks!
@ -160,7 +169,7 @@ merged_df2 = merge(x = gene_metadata
, by = merging_cols
, all.y = T)
cat("Dim of merged_df2: ", dim(merged_df2))
head(merged_df2$position)
# sanity check
@ -170,10 +179,10 @@ if(nrow(gene_metadata) == nrow(merged_df2)){
,"\nExpected no. of rows: ",nrow(gene_metadata)
,"\nGot no. of rows: ", nrow(merged_df2))
} else{
cat("FAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
, "\nExpected no. of rows after merge: ", nrow(gene_metadata)
, "\nGot no. of rows: ", nrow(merged_df2)
, "\nFinding discrepancy")
merged_muts_u = unique(merged_df2$mutationinformation)
meta_muts_u = unique(gene_metadata$mutationinformation)
# find the index where it differs
@ -227,6 +236,7 @@ if (identical(sum(is.na(merged_df3$or_kin))
, "\nNA in AF:", sum(is.na(merged_df3$af_kin))) , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
} }
#========================= #=========================
# Merge3: merged_df2_comp # Merge3: merged_df2_comp
# same as merge 1 but excluding NAs from ORs, etc. # same as merge 1 but excluding NAs from ORs, etc.
@ -237,6 +247,7 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"
na_count_df2 = sum(is.na(merged_df2$af)) na_count_df2 = sum(is.na(merged_df2$af))
merged_df2_comp = merged_df2[!is.na(merged_df2$af),] merged_df2_comp = merged_df2[!is.na(merged_df2$af),]
# sanity check: no +-1 gymnastics # sanity check: no +-1 gymnastics
cat("Checking nrows in merged_df2_comp") cat("Checking nrows in merged_df2_comp")
if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){ if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
@ -250,16 +261,18 @@ if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
,"\nGot no. of rows: ", nrow(merged_df2_comp)) ,"\nGot no. of rows: ", nrow(merged_df2_comp))
} }
#========================= #=========================
# Merge4: merged_df3_comp # Merge4: merged_df3_comp
# same as merge 2 but excluding NAs from ORs, etc or # same as merge 2 but excluding NAs from ORs, etc or
# remove duplicate mutation information # remove duplicate mutation information
#========================= #=========================
na_count_df3 = sum(is.na(merged_df3$af)) na_count_df3 = sum(is.na(merged_df3$af))
#merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
cat("\nChecking nrows in merged_df3_comp") cat("Checking nrows in merged_df3_comp")
if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){ if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
cat("\nPASS: No. of rows match" cat("\nPASS: No. of rows match"
,"\nDim of merged_df3_comp: " ,"\nDim of merged_df3_comp: "
@ -267,12 +280,11 @@ if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
, "\nNo. of rows: ", nrow(merged_df3_comp) , "\nNo. of rows: ", nrow(merged_df3_comp)
, "\nNo. of cols: ", ncol(merged_df3_comp)) , "\nNo. of cols: ", ncol(merged_df3_comp))
}else{ }else{
cat("\nFAIL: No. of rows mismatch" cat("FAIL: No. of rows mismatch"
,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
,"\nGot no. of rows: ", nrow(merged_df3_comp)) ,"\nGot no. of rows: ", nrow(merged_df3_comp))
} }
# alternate way of deriving merged_df3_comp # alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$af),] foo = merged_df3[!is.na(merged_df3$af),]
bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]
@ -307,7 +319,8 @@ all.equal(foo, bar)
# clear variables
rm(foo, bar, gene_metadata
, in_filename_params, infile_params, merging_cols
, in_filename_gene_metadata, infile_gene_metadata
, merged_df2v2, merged_df2v3)
#*************************
#####################################################################
# Combining: LIG
@ -334,4 +347,4 @@ if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){
#==========================================================================
# end of script
##==========================================================================

0
scripts/plotting/combining_two_df_FIXME.R Executable file → Normal file


@ -0,0 +1,263 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Corr plots for PS and Lig
# Output: 1 svg
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
source("Header_TT.R")
require(cowplot)
source("combining_dfs_plotting.R")
# should return the following dfs, directories and variables
# PS combined:
# 1) merged_df2
# 2) merged_df2_comp
# 3) merged_df3
# 4) merged_df3_comp
# LIG combined:
# 5) merged_df2_lig
# 6) merged_df2_comp_lig
# 7) merged_df3_lig
# 8) merged_df3_comp_lig
# 9) my_df_u
# 10) my_df_u_lig
cat(paste0("Directories imported:"
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir))
cat(paste0("Variables imported:"
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nAngstrom symbol:", angstroms_symbol
, "\nNo. of duplicated muts:", dup_muts_nu
, "\nNA count for ORs:", na_count
, "\nNA count in df2:", na_count_df2
, "\nNA count in df3:", na_count_df3))
#=======
# output
#=======
# can't combine by cowplot because not ggplots
#corr_plot_combined = "corr_combined.svg"
#plot_corr_plot_combined = paste0(plotdir,"/", corr_plot_combined)
# PS
corr_ps = "corr_PS.svg"
plot_corr_ps = paste0(plotdir,"/", corr_ps)
# LIG
corr_lig = "corr_LIG.svg"
plot_corr_lig = paste0(plotdir,"/", corr_lig)
####################################################################
# end of loading libraries and functions #
########################################################################
#%%%%%%%%%%%%%%%%%%%%%%%%%
df_ps = merged_df3_comp
df_lig = merged_df3_comp_lig
#%%%%%%%%%%%%%%%%%%%%%%%%%
rm( merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Data for Correlation plots:PS
#===========================
table(df_ps$duet_outcome)
# adding log cols
df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
df_ps$log10_or_kin = log10(df_ps$or_kin)
df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
# subset data to generate pairwise correlations
cols_to_select = c("duet_scaled"
, "log10_or_mychisq"
, "neglog_pval_fisher"
, "log10_or_kin"
, "neglog_pwald_kin"
, "af"
, "duet_outcome"
, drug)
corr_data_ps = df_ps[, cols_to_select]
dim(corr_data_ps)
#p_italic = substitute(paste("-Log(", italic('P'), ")"));p_italic
#p_adjusted_italic = substitute(paste("-Log(", italic('P adjusted'), ")"));p_adjusted_italic
# assign nice colnames (for display)
my_corr_colnames = c("DUET"
, "Log(OR)"
, "-Log(P)"
, "Log(OR adjusted)"
, "-Log(P wald)"
, "AF"
, "duet_outcome"
, drug)
length(my_corr_colnames)
colnames(corr_data_ps)
colnames(corr_data_ps) <- my_corr_colnames
colnames(corr_data_ps)
#-----------------
# generate corr PS plot
#-----------------
start = 1
end = which(colnames(corr_data_ps) == drug); end # should be the last column
offset = 1
my_corr_ps = corr_data_ps[start:(end-offset)]
head(my_corr_ps)
#my_cols = c("#f8766d", "#00bfc4")
# deep blue :#007d85
# deep red: #ae301e
cat("Corr plot PS:", plot_corr_ps)
svg(plot_corr_ps, width = 15, height = 15)
OutPlot1 = pairs.panels(my_corr_ps[1:(length(my_corr_ps)-1)]
, method = "spearman" # correlation method
, hist.col = "grey" ##00AFBB
, density = TRUE # show density plots
, ellipses = F # show correlation ellipses
, stars = T
, rug = F
, breaks = "Sturges"
, show.points = T
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_ps$duet_outcome))]
, pch = 21
, jitter = T
#, alpha = .05
#, points(pch = 19, col = c("#f8766d", "#00bfc4"))
, cex = 3
, cex.axis = 2.5
, cex.labels = 2.1
, cex.cor = 1
, smooth = F
)
print(OutPlot1)
dev.off()
#===========================
# Data for Correlation plots: LIG
#===========================
table(df_lig$ligand_outcome)
df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
df_lig$log10_or_kin = log10(df_lig$or_kin)
df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
# subset data to generate pairwise correlations
cols_to_select = c("affinity_scaled"
, "log10_or_mychisq"
, "neglog_pval_fisher"
, "log10_or_kin"
, "neglog_pwald_kin"
, "af"
, "ligand_outcome"
, drug)
corr_data_lig = df_lig[, cols_to_select]
dim(corr_data_lig)
# assign nice colnames (for display)
my_corr_colnames = c("Ligand Affinity"
, "Log(OR)"
, "-Log(P)"
, "Log(OR adjusted)"
, "-Log(P wald)"
, "AF"
, "ligand_outcome"
, drug)
length(my_corr_colnames)
colnames(corr_data_lig)
colnames(corr_data_lig) <- my_corr_colnames
colnames(corr_data_lig)
#-----------------
# generate corr LIG plot
#-----------------
start = 1
end = which(colnames(corr_data_lig) == drug); end # should be the last column
offset = 1
my_corr_lig = corr_data_lig[start:(end-offset)]
head(my_corr_lig)
cat("Corr LIG plot:", plot_corr_lig)
svg(plot_corr_lig, width = 15, height = 15)
OutPlot2 = pairs.panels(my_corr_lig[1:(length(my_corr_lig)-1)]
, method = "spearman" # correlation method
, hist.col = "grey" ##00AFBB
, density = TRUE # show density plots
, ellipses = F # show correlation ellipses
, stars = T
, rug = F
, breaks = "Sturges"
, show.points = T
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_lig$ligand_outcome))]
, pch = 21
, jitter = T
#, alpha = .05
#, points(pch = 19, col = c("#f8766d", "#00bfc4"))
, cex = 3
, cex.axis = 2.5
, cex.labels = 2.1
, cex.cor = 1
, smooth = F
)
print(OutPlot2)
dev.off()
#######################################################

56
scripts/plotting/dirs.R Normal file

@ -0,0 +1,56 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: formatting data that will be used for various plots
# useful links
#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns
#########################################################
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting")
getwd()
#source("Header_TT.R")
library(ggplot2)
library(data.table)
library(dplyr)
require("getopt", quietly = TRUE) #cmd parse arguments
#========================================================
# command line args
#spec = matrix(c(
# "drug" , "d", 1, "character",
# "gene" , "g", 1, "character"
#), byrow = TRUE, ncol = 4)
#opt = getopt(spec)
#drug = opt$drug
#gene = opt$gene
#if(is.null(drug)|is.null(gene)) {
# stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
#}
#========================================================
# FIXME: change to cmd
#%% variable assignment: input and output paths & filenames
drug = "pyrazinamide"
gene = "pncA"
gene_match = paste0(gene,"_p.")
cat(gene_match)
#=============
# directories and variables
#=============
datadir = paste0("~/git/Data")
indir = paste0(datadir, "/", drug, "/input")
outdir = paste0("~/git/Data", "/", drug, "/output")
plotdir = paste0("~/git/Data", "/", drug, "/output/plots")
dr_muts_col = paste0('dr_mutations_', drug)
other_muts_col = paste0('other_mutations_', drug)
resistance_col = "drtype"
#%%===============================================================


@ -0,0 +1,214 @@
#!/usr/bin/env Rscript
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
#########################################################
# TASK: Basic lineage barplot showing numbers
# Output: Basic barplot with lineage samples and mut count
##########################################################
# Installing and loading required packages
##########################################################
source("Header_TT.R")
require(data.table)
source("combining_dfs_plotting.R")
# should return the following dfs, directories and variables
# PS combined:
# 1) merged_df2
# 2) merged_df2_comp
# 3) merged_df3
# 4) merged_df3_comp
# LIG combined:
# 5) merged_df2_lig
# 6) merged_df2_comp_lig
# 7) merged_df3_lig
# 8) merged_df3_comp_lig
# 9) my_df_u
# 10) my_df_u_lig
cat("Directories imported:"
, "\n===================="
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir)
cat("Variables imported:"
, "\n====================="
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nAngstrom symbol:", angstroms_symbol
, "\nNo. of duplicated muts:", dup_muts_nu
, "\nNA count for ORs:", na_count
, "\nNA count in df2:", na_count_df2
, "\nNA count in df3:", na_count_df3
, "\ndr_muts_col:", dr_muts_col
, "\nother_muts_col:", other_muts_col
, "\ndrtype_col:", resistance_col)
#===========
# input
#===========
# output of combining_dfs_plotting.R
#=======
# output
#=======
# plot 1
basic_bp_lineage = "basic_lineage_barplot.svg"
plot_basic_bp_lineage = paste0(plotdir,"/", basic_bp_lineage)
#=======================================================================
#================
# Data for plots:
# you need merged_df2, comprehensive one
# since this has one-many relationship
# i.e the same SNP can belong to multiple lineages
#================
# REASSIGNMENT as necessary
my_df = merged_df2
# clear excess variable
rm(merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
#==========================
# Plot: Lineage barplot
# x = lineage y = No. of samples
# col = Lineage
# fill = lineage
#============================
table(my_df$lineage)
as.data.frame(table(my_df$lineage))
#=============
# Data for plots
#=============
# REASSIGNMENT
df <- my_df
rm(my_df)
# get freq count of positions so you can subset freq<1
#setDT(df)[, lineage_count := .N, by = .(lineage)]
#******************
# generate plot: barplot of mutation by lineage
#******************
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4"
#, "lineage5"
#, "lineage6"
#, "lineage7"
)
df_lin = subset(df, subset = lineage %in% sel_lineages)
# Create df with lineage info & no. of unique mutations
# per lineage and total samples within each lineage
# this is essentially a barplot with two y-axes
bar = as.data.frame(sel_lineages) # 4 rows, 1 col
total_snps_u = NULL
total_samples = NULL
for (i in sel_lineages){
#print(i)
curr_total = length(unique(df$id[df$lineage==i]))
total_samples = c(total_samples, curr_total)
print(total_samples)
foo = df[df$lineage==i,]
print(paste0(i, "======="))
print(length(unique(foo$mutationinformation)))
curr_count = length(unique(foo$mutationinformation))
total_snps_u = c(total_snps_u, curr_count)
}
print(total_snps_u)
bar$num_snps_u = total_snps_u
bar$total_samples = total_samples
bar
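## Equivalent per-lineage summary with dplyr (a sketch; assumes dplyr is installed;
## 'bar_dplyr' only cross-checks 'bar' and is not used downstream)
library(dplyr)
bar_dplyr = df_lin %>%
  group_by(lineage) %>%
  summarise(num_snps_u    = n_distinct(mutationinformation)
            , total_samples = n_distinct(id)) %>%
  as.data.frame()
bar_dplyr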
#*****************
# generate plot: lineage barplot with two y-axis
#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
#*****************
y1 = bar$num_snps_u
y2 = bar$total_samples
x = sel_lineages
to_plot = data.frame(x = x
, y1 = y1
, y2 = y2)
to_plot
# FIXME later: will be deprecated!
melted = melt(to_plot, id = "x")
melted
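## Non-deprecated reshaping alternative (a sketch; assumes tidyr is installed;
## output has the same x/variable/value columns as 'melted' above)
melted_tidyr = tidyr::pivot_longer(to_plot, cols = c(y1, y2)
                                   , names_to = "variable", values_to = "value")
melted_tidyr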
svg(plot_basic_bp_lineage)
my_ats = 20 # axis text size
my_als = 22 # axis label size
g = ggplot(melted, aes(x = x
, y = value
, fill = variable))
printFile = g + geom_bar(stat = "identity"
, position = position_stack(reverse = TRUE)
, alpha=.75
, colour='grey75') +
theme(axis.text.x = element_text(size = my_ats)
, axis.text.y = element_text(size = my_ats
#, angle = 30
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als
, colour = 'black')
, axis.title.y = element_text(size = my_als
, colour = 'black')
, legend.position = "top"
, legend.text = element_text(size = my_als)) +
#geom_text() +
geom_label(aes(label = value)
, size = 5
, hjust = 0.5
, vjust = 0.5
, colour = 'black'
, show.legend = FALSE
#, check_overlap = TRUE
, position = position_stack(reverse = T)) +
labs(title = ''
, x = ''
, y = "Number"
, fill = 'Variable'
, colour = 'black') +
scale_fill_manual(values = c('grey50', 'gray75')
, name=''
, labels=c('Mutations', 'Total Samples')) +
scale_x_discrete(breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
, labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'))
print(printFile)
dev.off()

View file

@ -0,0 +1,301 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Lineage dist plots: ggridges
# Output: 2 SVGs for PS stability
# 1) all muts
# 2) dr_muts
##########################################################
# Installing and loading required packages
##########################################################
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
source("Header_TT.R")
library(ggridges)
source("combining_dfs_plotting.R")
# PS combined:
# 1) merged_df2
# 2) merged_df2_comp
# 3) merged_df3
# 4) merged_df3_comp
# LIG combined:
# 5) merged_df2_lig
# 6) merged_df2_comp_lig
# 7) merged_df3_lig
# 8) merged_df3_comp_lig
# 9) my_df_u
# 10) my_df_u_lig
cat("Directories imported:"
, "\n===================="
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir)
cat("Variables imported:"
, "\n====================="
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nAngstrom symbol:", angstroms_symbol
, "\nNo. of duplicated muts:", dup_muts_nu
, "\nNA count for ORs:", na_count
, "\nNA count in df2:", na_count_df2
, "\nNA count in df3:", na_count_df3
, "\ndr_muts_col:", dr_muts_col
, "\nother_muts_col:", other_muts_col
, "\ndrtype_col:", resistance_col)
#=======
# output
#=======
lineage_dist_combined = "lineage_dist_combined_PS.svg"
plot_lineage_dist_combined = paste0(plotdir,"/", lineage_dist_combined)
#========================================================================
###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this has a one-to-many relationship
# i.e. the same SNP can belong to multiple lineages
# using the _comp dataset means
# we lose some muts; at this level, we should use
# as much info as available, hence use the df with NAs
###########################
# REASSIGNMENT
my_df = merged_df2
# delete variables not required
rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_df)
str(my_df)
# Ensure correct data type in columns to plot: need to be factor
is.factor(my_df$lineage)
my_df$lineage = as.factor(my_df$lineage)
is.factor(my_df$lineage)
table(my_df$mutation_info)
# subset df with dr muts only
my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide")
table(my_df_dr$mutation_info)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#==========================
# Plot 1: ALL Muts
# x = mcsm_values, y = dist
# fill = stability
#============================
my_plot_name = 'lineage_dist_PS.svg'
plot_lineage_duet = paste0(plotdir,"/", my_plot_name)
#===================
# Data for plots
#===================
table(my_df$lineage); str(my_df$lineage)
# subset only lineages1-4
sel_lineages = c("lineage1"
, "lineage2"
, "lineage3"
, "lineage4"
#, "lineage5"
#, "lineage6"
#, "lineage7"
)
# uncomment as necessary
df_lin = subset(my_df, subset = lineage %in% sel_lineages )
table(df_lin$lineage)
# refactor
df_lin$lineage = factor(df_lin$lineage)
sum(table(df_lin$lineage)) #{RESULT: Total number of samples for lineage}
table(df_lin$lineage)#{RESULT: No of samples within lineage}
length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
length(df_lin$mutationinformation)
u2 = unique(my_df$mutationinformation)
u = unique(df_lin$mutationinformation)
check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
#%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df <- df_lin
#%%%%%%%%%%%%%%%%%%%%%%%%%
rm(df_lin)
#******************
# generate distribution plot of lineages
#******************
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size
my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'
#, 'Lineage 5', 'Lineage 6', 'Lineage 7'
)
names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4'
# , 'lineage5', 'lineage6', 'lineage7'
)
# check plot name
plot_lineage_duet
# output svg
#svg(plot_lineage_duet)
p1 = ggplot(df, aes(x = duet_scaled
, y = duet_outcome))+
#printFile=geom_density_ridges_gradient(
geom_density_ridges_gradient(aes(fill = ..x..)
, scale = 3
, size = 0.3 ) +
facet_wrap( ~lineage
, scales = "free"
#, switch = 'x'
, labeller = labeller(lineage = my_labels) ) +
coord_cartesian( xlim = c(-1, 1)) +
scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
, name = "DUET" ) +
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size = my_als)
, legend.text = element_text(size = my_als-5)
, legend.title = element_text(size = my_als)
)
print(p1)
#dev.off()
#######################################################################
# lineage distribution plot for dr_muts
#######################################################################
#==========================
# Plot 2: dr muts ONLY
# x = mcsm_values, y = dist
# fill = stability
#============================
my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg'
plot_lineage_dr_duet = paste0(plotdir,"/", my_plot_name_dr)
#===================
# Data for plots
#===================
table(my_df_dr$lineage); str(my_df_dr$lineage)
# uncomment as necessary
df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
table(df_lin_dr$lineage)
# refactor
df_lin_dr$lineage = factor(df_lin_dr$lineage)
sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage}
table(df_lin_dr$lineage)#{RESULT: No of samples within lineage}
length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
length(df_lin_dr$mutationinformation)
u2 = unique(my_df_dr$mutationinformation)
u = unique(df_lin_dr$mutationinformation)
check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
#%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df_dr <- df_lin_dr
#%%%%%%%%%%%%%%%%%%%%%%%%%
rm(df_lin_dr)
#******************
# generate distribution plot of lineages
#******************
# 2 : ggridges (good!)
my_ats = 15 # axis text size
my_als = 20 # axis label size
# check plot name
plot_lineage_dr_duet
# output svg
#svg(plot_lineage_dr_duet)
p2 = ggplot(df_dr, aes(x = duet_scaled
, y = duet_outcome))+
#printFile=geom_density_ridges_gradient(
geom_density_ridges_gradient(aes(fill = ..x..)
, scale = 3
, size = 0.3 ) +
facet_wrap( ~lineage
, scales = "free"
#, switch = 'x'
, labeller = labeller(lineage = my_labels) ) +
coord_cartesian( xlim = c(-1, 1)
#, ylim = c(0, 6)
#, clip = "off"
) +
scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
, name = "DUET" ) +
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_blank()
, axis.title.x = element_blank()
, axis.title.y = element_blank()
, axis.ticks.y = element_blank()
, plot.title = element_blank()
, strip.text = element_text(size = my_als)
, legend.text = element_text(size = 10)
, legend.title = element_text(size = my_als)
#, legend.position = "none"
)
print(p2)
#dev.off()
########################################################################
#==============
# combine plot
#===============
svg(plot_lineage_dist_combined, width = 12, height = 6)
printFile = cowplot::plot_grid(p1, p2
, label_size = my_als+10)
print(printFile)
dev.off()

View file

@ -0,0 +1,96 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: To write muts with opposite effects on
# protomer and ligand stability
#########################################################
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
source("plotting_data.R")
# should return the following dfs, directories and variables
# my_df
# my_df_u
# my_df_u_lig
# dup_muts
cat(paste0("Directories imported:"
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir))
cat(paste0("Variables imported:"
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nLength of upos:", length(upos)
, "\nAngstrom symbol:", angstroms_symbol))
# clear excess variable
rm(my_df, upos, dup_muts)
#========================================================
#===========
# input
#===========
#in_file1: output of plotting_data.R
# my_df_u
#===========
# output
#===========
# mutations with opposite effects
out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv")
outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts)
#%%===============================================================
# spelling Correction 1: DUET, in case American spelling is needed!
table(my_df_u$duet_outcome); sum(table(my_df_u$duet_outcome) )
#my_df_u$duet_outcome[my_df_u$duet_outcome=="Stabilising"] <- "Stabilizing"
#my_df_u$duet_outcome[my_df_u$duet_outcome=="Destabilising"] <- "Destabilizing"
# spelling Correction 2: Ligand, in case American spelling is needed!
table(my_df_u$ligand_outcome); sum(table(my_df_u$ligand_outcome) )
#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Stabilising"] <- "Stabilizing"
#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Destabilising"] <- "Destabilizing"
# muts with opposing effects on protomer and ligand stability
table(my_df_u$duet_outcome != my_df_u$ligand_outcome)
changes = my_df_u[which(my_df_u$duet_outcome != my_df_u$ligand_outcome),]
# sanity check: redundant, but uber cautious!
dl_i = which(my_df_u$duet_outcome != my_df_u$ligand_outcome)
ld_i = which(my_df_u$ligand_outcome != my_df_u$duet_outcome)
cat("Identifying muts with opposite stability effects")
if(nrow(changes) == sum(my_df_u$duet_outcome != my_df_u$ligand_outcome) & identical(dl_i, ld_i)) {
cat("PASS: muts with opposite effects on stability and affinity correctly identified"
, "\nNo. of such muts: ", nrow(changes))
}else {
cat("FAIL: unsuccessful in extracting muts with changed stability effects")
}
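## Equivalent selection with dplyr (a sketch; assumes dplyr is installed;
## 'changes_alt' is only a cross-check and is not used downstream)
changes_alt = dplyr::filter(my_df_u, duet_outcome != ligand_outcome)
nrow(changes_alt) == nrow(changes) # expected TRUE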
#==========================
# write file: changed muts
#==========================
write.csv(changes, outfile_opp_muts)
cat("Finished writing file for muts with opp effects:"
, "\nFilename: ", outfile_opp_muts
, "\nDim:", dim(changes))
# clear variables
rm(out_filename_opp_muts, outfile_opp_muts)
rm(changes, dl_i, ld_i)
# count na in each column
na_count = sapply(my_df_u, function(y) sum(is.na(y))); na_count
df_ncols = ncol(my_df_u)
#===================================== end of script

View file

@ -0,0 +1,192 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: Bubble plot of OR for PS and Lig
# Output: 1 svg
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
source("Header_TT.R")
require(cowplot)
source("combining_dfs_plotting.R")
# should return the following dfs, directories and variables
# PS combined:
# 1) merged_df2
# 2) merged_df2_comp
# 3) merged_df3
# 4) merged_df3_comp
# LIG combined:
# 5) merged_df2_lig
# 6) merged_df2_comp_lig
# 7) merged_df3_lig
# 8) merged_df3_comp_lig
# 9) my_df_u
# 10) my_df_u_lig
cat(paste0("Directories imported:"
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir))
cat(paste0("Variables imported:"
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nAngstrom symbol:", angstroms_symbol
, "\nNo. of duplicated muts:", dup_muts_nu
, "\nNA count for ORs:", na_count
, "\nNA count in df2:", na_count_df2
, "\nNA count in df3:", na_count_df3))
#=========================
#=======
# output
#=======
or_combined = "or_combined_PS_LIG.svg"
plot_or_combined = paste0(plotdir,"/", or_combined)
#or_kin_combined = "or_kin_combined_PS_LIG.svg"
#plot_or_kin_combined = paste0(plotdir,"/", or_kin_combined)
#=======================================================================
###########################
# Data for OR and stability plots
# you need merged_df3_comp
# since these are matched
# to allow pairwise corr
###########################
ps_df = merged_df3_comp
lig_df = merged_df3_comp_lig
# Ensure correct data type in columns to plot: should be TRUE
is.numeric(ps_df$or_mychisq)
is.numeric(lig_df$or_mychisq)
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig)
#%% end of section 1
# sanity check: should be <10
if (max(lig_df$ligand_distance) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
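## Possible corrective step if the check above fails (a sketch, kept commented out
## so it does not silently alter the data; column name as used above):
#lig_df = lig_df[lig_df$ligand_distance < 10, ]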
#############
# Plots: Bubble plot
# x = Position, Y = stability
# size of dots = OR
# col: stability
#############
#-----------------
# Plot 1: DUET vs OR by position as geom_points
#-------------------
my_ats = 20 # axis text size
my_als = 22 # axis label size
# Spelling Correction: made redundant as already corrected at the source
#ps_df$duet_outcome[ps_df$duet_outcome=='Stabilizing'] <- 'Stabilising'
#ps_df$duet_outcome[ps_df$duet_outcome=='Destabilizing'] <- 'Destabilising'
table(ps_df$duet_outcome) ; sum(table(ps_df$duet_outcome))
g1 = ggplot(ps_df, aes(x = factor(position)
, y = duet_scaled))
p1 = g1 +
geom_point(aes(col = duet_outcome
, size = or_mychisq))+
#, size = or_kin)) + # not good, almost like log(or)
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, legend.text = element_text(size = my_als)
, legend.title = element_text(size = my_als) ) +
#, legend.key.size = unit(1, "cm")) +
labs(title = ""
, x = "Position"
, y = "DUET(PS)"
, size = "Odds Ratio"
, colour = "DUET Outcome") +
guides(colour = guide_legend(override.aes = list(size=4)))
p1
#-------------------
# generate plot 2: Lig vs OR by position as geom_points
#-------------------
# Spelling Correction: made redundant as already corrected at the source
#lig_df$ligand_outcome[lig_df$ligand_outcome=='Stabilizing'] <- 'Stabilising'
#lig_df$ligand_outcome[lig_df$ligand_outcome=='Destabilizing'] <- 'Destabilising'
table(lig_df$ligand_outcome)
g2 = ggplot(lig_df, aes(x = factor(position)
, y = affinity_scaled))
p2 = g2 +
geom_point(aes(col = ligand_outcome
, size = or_mychisq))+
#, size = or_kin)) + # not good! almost like log(or)
theme(axis.text.x = element_text(size = my_ats
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_als)
, axis.title.y = element_text(size = my_als)
, legend.text = element_text(size = my_als)
, legend.title = element_text(size = my_als) ) +
#, legend.key.size = unit(1, "cm")) +
labs(title = ""
, x = "Position"
, y = "Ligand Affinity"
, size = "Odds Ratio"
, colour = "Ligand Outcome"
) +
guides(colour = guide_legend(override.aes = list(size=4)))
p2
#======================
# combine using cowplot
#======================
svg(plot_or_combined, width = 32, height = 12)
#svg(plot_or_kin_combined, width = 32, height = 12)
theme_set(theme_gray()) # to preserve default theme
printFile = cowplot::plot_grid(plot_grid(p1, p2
, ncol = 1
, align = 'v'
, labels = c("", "")
, label_size = my_als+5))
print(printFile)
dev.off()

View file

@ -15,7 +15,10 @@ library(ggplot2)
library(data.table)
library(dplyr)
source("dirs.R")
require("getopt", quietly = TRUE) #cmd parse arguments
#========================================================
# command line args
#spec = matrix(c(
@ -31,19 +34,6 @@ require("getopt", quietly = TRUE) #cmd parse arguments
# stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
#}
#========================================================
#%% variable assignment: input and output paths & filenames
drug = "pyrazinamide"
gene = "pncA"
gene_match = paste0(gene,"_p.")
cat(gene_match)
#=============
# directories
#=============
datadir = paste0("~/git/Data")
indir = paste0(datadir, "/", drug, "/input")
outdir = paste0("~/git/Data", "/", drug, "/output")
plotdir = paste0("~/git/Data", "/", drug, "/output/plots")
#======
# input
#======
@ -52,6 +42,17 @@ in_filename_params = paste0(tolower(gene), "_all_params.csv")
infile_params = paste0(outdir, "/", in_filename_params)
cat(paste0("Input file 1:", infile_params) )
cat('columns based on variables:\n'
, drug
, '\n'
, dr_muts_col
, '\n'
, other_muts_col
, "\n"
, resistance_col
, '\n===============================================================')
#%%===============================================================
###########################
# Read file: struct params
@ -62,9 +63,6 @@ my_df = read.csv(infile_params, header = T)
cat("\nInput dimensions:", dim(my_df))
# quick checks
#colnames(my_df)
#str(my_df)
###########################
# extract unique mutation entries # extract unique mutation entries

View file

@ -0,0 +1,251 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: AF, OR and stability plots: PS
# Output: 1 SVG
#########################################################
# Installing and loading required packages
##########################################################
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
source("Header_TT.R")
library(ggplot2)
library(data.table)
#source("combining_dfs_plotting.R")
source("barplot_colour_function.R")
source("subcols_axis.R")
###########################
# Data for PS plots
# you need merged_df3_comp/merged_df_comp
# since these have unique SNPs
###########################
no_or_index = which(is.na(my_df_u_cols$or_mychisq))
my_df_u_cols_clean = my_df_u_cols[-no_or_index,]
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df = my_df_u_cols_clean
#%%%%%%%%%%%%%%%%%%%%%%%%%
cols_to_select = colnames(mut_pos_cols)
mut_pos_cols_clean = df[colnames(df)%in%cols_to_select]
mut_pos_cols_clean = unique(mut_pos_cols_clean)
########################################################################
# end of data extraction and cleaning for plots #
########################################################################
#===========================
# Barplot with axis colours:
#===========================
#================
# Inspecting mut_pos_cols
# position numbers and colours and assigning axis colours based on lab_fg
# of the correct df
# open file from desktop ("sample_axis_cols") for cross checking
#================
# very important!
#my_axis_colours = mut_pos_cols$lab_fg
if (nrow(mut_pos_cols_clean) == length(unique(df$position)) ){
print("PASS: lengths checked, assigning axis colours")
my_axis_colours = mut_pos_cols_clean$lab_fg
cat("length of axis colours:", length(my_axis_colours)
, "\nwhich corresponds to the number of positions on the x-axis of the plot")
}else{
print("FAIL:lengths mismatch, could not assign axis colours")
quit()
}
# unique positions
upos = unique(df$position); length(upos)
table(df$duet_outcome)
# add frequency of positions (from lib data.table)
setDT(df)[, pos_count := .N, by = .(position)]
# this is cumulative
table(df$pos_count)
# use group by on this
library(dplyr)
snpsBYpos_df <- df %>%
group_by(position) %>%
summarize(snpsBYpos = mean(pos_count))
table(snpsBYpos_df$snpsBYpos)
snp_count = sort(unique(snpsBYpos_df$snpsBYpos))
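## Equivalent group-by with data.table (a sketch; df is already a data.table
## after setDT() above; the result only cross-checks snpsBYpos_df)
snpsBYpos_dt = df[, .(snpsBYpos = mean(pos_count)), by = position]
table(snpsBYpos_dt$snpsBYpos)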
# sanity checks
# should be a factor
if (is.factor(df$duet_outcome)){
print("duet_outcome is factor")
}else{
print("converting duet_outcome to factor")
df$duet_outcome = as.factor(df$duet_outcome)
}
is.factor(df$duet_outcome)
# should be -1 and 1
min(df$duet_scaled)
max(df$duet_scaled)
# sanity checks
tapply(df$duet_scaled, df$duet_outcome, min)
tapply(df$duet_scaled, df$duet_outcome, max)
# My colour FUNCTION: based on group and subgroup
# in my case;
# df = df
# group = duet_outcome
# subgroup = normalised score i.e duet_scaled
# check unique values in normalised data
u = unique(df$duet_scaled)
cat("No. of unique values in normalised data:", length(u))
# Define group
# Create an extra column called group which contains the "gp name and score"
# so colours can be generated for each unique values in this column
my_grp = df$duet_scaled # no rounding
df$group <- paste0(df$duet_outcome, "_", my_grp)
# Call the function to create the palette based on the group defined above
colours <- ColourPalleteMulti(df, "duet_outcome", "my_grp")
print(paste0("Colour palette generated for: ", length(colours), " colours"))
my_title = "Protein stability (DUET)"
cat("No. of axis colours: ", length(my_axis_colours))
#******************
# generate plot: barplot unordered by frequency with axis colours
#******************
class(df$lab_bg)
# define cartesian coord
my_xlim = length(unique(df$position)); my_xlim
# axis label size
my_xals = 18
my_yals = 18
# axes text size
my_xats = 14
my_yats = 18
g3 = ggplot(df, aes(factor(position, ordered = T)))
p3 = g3 +
coord_cartesian(xlim = c(1, my_xlim)
#, ylim = c(0, 6)
, ylim = c(0, max(snp_count))
, clip = "off") +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual(values = colours
, guide = "none") +
geom_tile(aes(y = -0.8, width = 0.95, height = 0.85)
, fill = df$lab_bg) +
geom_tile(aes(y = -1.2, width = 0.95, height = -0.2)
, fill = df$lab_bg2) +
# Here it"s important to specify that your axis goes from 1 to max number of levels
theme(axis.text.x = element_text(size = my_xats
, angle = 90
, hjust = 1
, vjust = 0.4
, colour = my_axis_colours)
, axis.text.y = element_text(size = my_yats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_text(size = my_xals)
, axis.title.y = element_text(size = my_yals+2 )
, axis.ticks.x = element_blank()) +
labs(title = ""
#title = my_title
, x = "Position"
, y = "Frequency")
p3
#=================
# generate plot: AF by position unordered, not coloured
#=================
my_ats = 20 # axis text size
my_als = 22 # axis label size
g1 = ggplot(df)
p1 = g1 +
geom_bar(aes(x = factor(position, ordered = T)
, y = af*100
#, fill = duet_outcome
)
, color = "black"
, fill = "lightgrey"
, stat = "identity") +
theme(axis.text.x = element_blank()
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_blank()
, axis.title.y = element_text(size = my_als) ) +
labs(title = ""
#, size = 100
#, x = "Wild-type position"
, y = "AF(%)")
p1
#=================
# generate plot: OR by position unordered
#=================
my_ats = 20 # axis text size
my_als = 22 # axis label size
g2 = ggplot(df)
p2 = g2 +
geom_bar(aes(x = factor(position, ordered = T)
, y = or_mychisq
#, fill = duet_outcome
)
, colour = "black"
, fill = "lightgray"
, stat = "identity") +
theme(axis.text.x = element_blank()
, axis.text.y = element_text(size = my_ats
, angle = 0
, hjust = 1
, vjust = 0)
, axis.title.x = element_blank()
, axis.title.y = element_text(size = my_als) ) +
labs(#title = "OR by position"
#, x = "Wild-type position"
y = "OR")
p2
########################################################################
# end of DUET barplots #
########################################################################
#=======
# output
#=======
ps_plots_combined = "af_or_combined_PS.svg"
plot_ps_plots_combined = paste0(plotdir,"/", ps_plots_combined )
svg(plot_ps_plots_combined , width = 26, height = 12)
#theme_set(theme_gray()) # to preserve default theme
printFile = cowplot::plot_grid(p1, p2, p3
, ncol = 1
, align = 'v')
print(printFile)
dev.off()

View file

@ -0,0 +1,136 @@
#!/usr/bin/env Rscript
#########################################################
# TASK: To resolve ambiguous muts present in <gene>_metadata.csv
#(which is one of the outputs from python script)
# Input csv file: <gene>_metadata.csv
# Output csv file: <gene>_metadata_clean.csv
#########################################################
#=======================================================================
# working dir and loading libraries
getwd()
setwd("~/git/LSHTM_analysis/scripts/plotting/")
getwd()
source("dirs.R")
cat("Directories imported:"
, "\n===================="
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir)
cat("Variables imported:"
, "\n====================="
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\ndr_muts_col:", dr_muts_col
, "\nother_muts_col:", other_muts_col
, "\ndrtype_col:", resistance_col)
#========================================================
#===========
# input
#===========
# infile1: gene associated meta data
gene_metadata_raw = paste0(tolower(gene), "_metadata_raw.csv")
infile_gene_metadata_raw = paste0(outdir, "/", gene_metadata_raw)
cat("Input infile 1:", infile_gene_metadata_raw)
# infile 2: ambiguous muts
ambiguous_muts = paste0(tolower(gene), "_ambiguous_mut_names.csv")
infile_ambiguous_muts = paste0(outdir, "/", ambiguous_muts)
cat("Input infile 2:", infile_ambiguous_muts)
#===========
# output
#===========
# clean gene metadata with ambiguous muts resolved
gene_metadata_clean = paste0(tolower(gene), "_metadata.csv")
outfile_gene_metadata_clean = paste0(outdir, "/", gene_metadata_clean)
cat("Output file:", outfile_gene_metadata_clean)
#%%===============================================================
###########################
# Read file: <gene>_metadata_raw.csv
###########################
cat("Reading meta data file:", infile_gene_metadata_raw)
gene_metadata_raw <- read.csv(infile_gene_metadata_raw
, stringsAsFactors = F
, header = T)
cat("Dim:", dim(gene_metadata_raw))
###########################
# Read file: ambiguous muts.csv
##########################
ambiguous_muts = read.csv(infile_ambiguous_muts)
ambiguous_muts_names = ambiguous_muts$mutation
common_muts_all = gene_metadata_raw[gene_metadata_raw$mutation%in%ambiguous_muts_names,]
#==============
# resolving ambiguous muts
#===============
table(gene_metadata_raw$mutation_info)
count_check = as.data.frame(cbind(table(gene_metadata_raw$mutationinformation, gene_metadata_raw$mutation_info)))
count_check$checks = ifelse(count_check[[dr_muts_col]] > 0 & count_check[[other_muts_col]] > 0
, "ambi", "pass")
table(count_check$checks)
cat(table(count_check$checks)[["ambi"]], "ambiguous muts detected. Correcting these")
# remove all ambiguous muts from df
ids = gene_metadata_raw$mutation%in%common_muts_all$mutation; table(ids)
gene_metadata_raw_unambiguous = gene_metadata_raw[!ids,]
# sanity checks: should be true
table(gene_metadata_raw_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_raw_unambiguous)
nrow(gene_metadata_raw_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata_raw)
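## Optional guard (a sketch): halt here if the two sanity checks above do not hold
stopifnot(!any(gene_metadata_raw_unambiguous$mutation %in% common_muts_all$mutation)
          , nrow(gene_metadata_raw_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata_raw))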
# check before resolving ambiguous muts
table(common_muts_all$mutation_info)
common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info)
# resolving ambiguous muts: change other_muts to dr_muts
common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col
table(common_muts_all$mutation_info)
common_muts_all$mutation_info = factor(common_muts_all$mutation_info)
table(common_muts_all$mutation_info)
common_muts_all$mutation_info = as.character(common_muts_all$mutation_info)
# combining to a clean df
gene_metadata = rbind(gene_metadata_raw_unambiguous, common_muts_all)
all(dim(gene_metadata) == dim(gene_metadata_raw))
table(gene_metadata$mutation_info)
count_check2 = as.data.frame(cbind(table(gene_metadata$mutationinformation
, gene_metadata$mutation_info)))
count_check2$checks = ifelse(count_check2[[dr_muts_col]] > 0 & count_check2[[other_muts_col]] > 0
, "ambi", "pass")
if (all(count_check2$checks == "pass")){
cat("PASS: ambiguous muts successfully resolved."
, "\nwriting file")
}else{
print("FAIL: ambiguous muts could not be resolved!")
quit()
}
#============
# writing file
#=============
write.csv(gene_metadata, outfile_gene_metadata_clean
, row.names = FALSE)
#=====================================================================
# End of script
#======================================================================

View file

@ -1,12 +1,26 @@
#========
# plotting
#========
FIXME:
#=======================
dirs.R
#=======================
#dirs.R: imports dir structure
change this to a cmd-line script so you can pass in the drug and gene name
#=======================
plotting_data.R
#=======================
??? update how to run
#=======================
combining_dfs_plotting.R
#=======================
??? update how to run
#=======================
mcsm_mean_stability.R
#=======================
@ -58,6 +72,13 @@ barplots_subcolours_PS.R
# output plots: 1 svg
1) barplot_coloured_PS.svg
#=======================
lineage_dist_PS.R
# lineage distribution for all muts and dr muts in one figure
#=======================
# input: calls the "combining_dfs_plotting.R"
# output plots: 1 svg
1) lineage_dist_combined_PS.svg
########################################################################

96
scripts/plotting/test.R Normal file
View file

@ -0,0 +1,96 @@
setwd("/home/tanu/git/LSHTM_analysis/scripts/plotting")
source("combining_dfs_plotting.R")
table(merged_df3$mutation_info)
# assign foldx
#ddg<0 = "Stabilising" (-ve)
table(merged_df3$ddg < 0)
merged_df3$foldx_outcome = ifelse(merged_df3$ddg < 0, "Stabilising", "Destabilising")
#===========
# PS data
#===========
dr_muts = merged_df3[merged_df3$mutation_info == "dr_mutations_pyrazinamide",]
other_muts = merged_df3[merged_df3$mutation_info == "other_mutations_pyrazinamide",]
par(mfrow = c(1,1))
par(mfrow = c(2,6))
# mcsm duet
boxplot(dr_muts$duet_scaled, other_muts$duet_scaled, main = "DUET"
#, col = factor(merged_df3$duet_outcome)
)
wilcox.test(dr_muts$duet_scaled, other_muts$duet_scaled, paired = F)
# foldx ddg
boxplot(dr_muts$ddg, other_muts$ddg, main = "Foldx")
wilcox.test(dr_muts$ddg, other_muts$ddg, paired = F)
# rd
boxplot(dr_muts$rd_values, other_muts$rd_values, main = "RD")
wilcox.test(dr_muts$rd_values, other_muts$rd_values)
# kd
boxplot(dr_muts$kd_values, other_muts$kd_values, main = "KD")
wilcox.test(dr_muts$kd_values, other_muts$kd_values)
# asa
boxplot(dr_muts$asa, other_muts$asa, main = "ASA")
wilcox.test(dr_muts$asa, other_muts$asa)
# rsa
boxplot(dr_muts$rsa, other_muts$rsa, main = "RSA")
wilcox.test(dr_muts$rsa, other_muts$rsa)
#===================================================================
#==========
# LIG data
#==========
dr_muts_lig = merged_df3_lig[merged_df3_lig$mutation_info == "dr_mutations_pyrazinamide",]
other_muts_lig = merged_df3_lig[merged_df3_lig$mutation_info == "other_mutations_pyrazinamide",]
# mcsm ligand affinity
boxplot(dr_muts_lig$duet_scaled, other_muts_lig$duet_scaled, main = "Ligand affinity")
wilcox.test(dr_muts_lig$duet_scaled, other_muts_lig$duet_scaled, paired = F)
# rd
boxplot(dr_muts_lig$rd_values, other_muts_lig$rd_values, main = "RD")
wilcox.test(dr_muts_lig$rd_values, other_muts_lig$rd_values)
# kd
boxplot(dr_muts_lig$kd_values, other_muts_lig$kd_values, main = "KD")
wilcox.test(dr_muts_lig$kd_values, other_muts_lig$kd_values)
# asa
boxplot(dr_muts_lig$asa, other_muts_lig$asa, main = "ASA")
wilcox.test(dr_muts_lig$asa, other_muts_lig$asa)
# rsa
boxplot(dr_muts_lig$rsa, other_muts_lig$rsa, main = "RSA")
wilcox.test(dr_muts_lig$rsa, other_muts_lig$rsa)
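## Less repetitive alternative (a sketch): loop the boxplot + Wilcoxon test over
## the PS columns compared above; placed here so it does not disturb the layout
## of the base plots drawn earlier in this scratch script
ps_cols = c("duet_scaled", "ddg", "rd_values", "kd_values", "asa", "rsa")
par(mfrow = c(2, 3))
for (col in ps_cols){
  boxplot(dr_muts[[col]], other_muts[[col]], main = col)
  print(wilcox.test(dr_muts[[col]], other_muts[[col]], paired = FALSE))
}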
# checking agreement b/w mcsm and foldx
cols_to_select = c("mutationinformation"
, "mutation_info"
, "duet_scaled"
, "ddg"
, "duet_outcome"
, "foldx_outcome")
merged_df3_short = select(merged_df3, cols_to_select)
mcsm_foldx = merged_df3_short[which(merged_df3_short$duet_outcome != merged_df3_short$foldx_outcome),]
mcsm_foldx$sign_comp = ifelse(sign(mcsm_foldx$duet_scaled)==sign(mcsm_foldx$ddg), "PASS", "FAIL")
table(mcsm_foldx$sign_comp)
# another way of checking
merged_df3$sign_comp = ifelse(sign(merged_df3$duet_scaled)==sign(merged_df3$ddg), "PASS", "FAIL")
table(merged_df3$sign_comp)
disagreement = table(merged_df3$sign_comp)[2]/nrow(merged_df3)*100
agreement = 100 - disagreement
cat("There is", agreement, "% between mcsm and foldx predictions")