From ddefcd7841494ae43440e122c40bcc9cec7a09b4 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 4 Sep 2020 20:55:55 +0100
Subject: [PATCH 01/27] resolving missing mutation info in combining script

---
 scripts/combining_dfs.py   | 98 +++++++++++++++++++++++++++++++++-----
 scripts/data_extraction.py |  2 -
 scripts/or_kinship_link.py |  4 +-
 3 files changed, 87 insertions(+), 17 deletions(-)

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index ba40b46..50ff6ee 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -54,6 +54,7 @@ os.getcwd()
 # FIXME: local imports
 #from combining import combine_dfs_with_checks
 from combining_FIXME import detect_common_cols
+from reference_dict import oneletter_aa_dict # CHECK DIR STRUC THERE!
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
@@ -155,6 +156,8 @@ ncols_m1 = len(mcsm_foldx_dfs.columns)
 
 print('\n\nResult of first merge:', mcsm_foldx_dfs.shape
       , '\n===================================================================')
+mcsm_foldx_dfs[merging_cols_m1].apply(len)
+mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs)
 #%%============================================================================
 print('==================================='
       , '\nSecond merge: dssp + kd'
@@ -183,6 +186,8 @@ ncols_m3 = len(dssp_kd_rd_dfs.columns)
 
 print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape
       , '\n===================================================================')
+dssp_kd_rd_dfs[merging_cols_m3].apply(len)
+dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs)
 #%%============================================================================
 print('======================================='
       , '\nFourth merge: First merge + Third merge'
@@ -203,12 +208,14 @@ else:
 
 print('\nResult of Fourth merge:', combined_df.shape
       , '\n===================================================================')
+combined_df[merging_cols_m4].apply(len)
+combined_df[merging_cols_m4].apply(len) == len(combined_df)
 #%%============================================================================
 
 # OR merges: TEDIOUSSSS!!!!
 
 
-#%%RRRR
+#%%
 print('==================================='
       , '\nFifth merge: afor_df + afor_kin_df'
       , '\n===================================')
@@ -220,8 +227,6 @@ afor_df = pd.read_csv(infile_afor, sep = ',')
 afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',')
 afor_kin_df.columns = afor_kin_df.columns.str.lower()
 
-
-
 merging_cols_m5 = detect_common_cols(afor_df, afor_kin_df)
 
 print('Dim of afor_df:', afor_df.shape
@@ -244,7 +249,7 @@ common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])])
 extra_muts_myor =   afor_kin_df.shape[0] - common_muts
 
 print('=========================================='
-      , '\nmy or calcs', extra_muts_myor, 'extra mutation\n'
+      , '\nmy or calcs has', extra_muts_myor, 'extra mutations'
       , '\n==========================================')
 
 print('Expected cals for merging with outer_join...')
@@ -261,12 +266,13 @@ else:
           , '\nCheck expected rows and cols calculation and join type')
 
 print('Dim of merged ors_df:', ors_df.shape)
+
+ors_df[merging_cols_m5].apply(len)
+ors_df[merging_cols_m5].apply(len) == len(ors_df)
 #%%============================================================================
 # formatting ors_df
-
 ors_df.columns
 
-
 # Dropping unncessary columns: already removed in ealier preprocessing
 #cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
 cols_to_drop = ['n_miss']
@@ -324,7 +330,7 @@ column_order = ['mutation'
                 #, 'n_miss'
                 ]
 
-if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all()):
+if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())):
     print('PASS: Column order generated for all:', len(column_order), 'columns'
           , '\nColumn names match, safe to reorder columns'
           , '\nApplying column order to df...' )
@@ -357,10 +363,35 @@ print('Checking mutations in the two dfs:'
 
 #print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) )
 
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join)
 combined_df_all.shape
 
+# populate mut_info_f1
+combined_df_all['mut_info_f1'].isna().sum()
+combined_df_all['mut_info_f1'] = combined_df_all['position'].astype(str) + combined_df_all['wild_type'] + '>' + combined_df_all['position'].astype(str) + combined_df_all['mutant_type']
+combined_df_all['mut_info_f1'].isna().sum()  
+
+# populate mut_type
+combined_df_all['mut_type'].isna().sum()
+#mut_type_word = combined_df_all['mut_type'].value_counts()
+mut_type_word  = 'missense' # FIXME, should be derived
+combined_df_all['mut_type'].fillna(mut_type_word, inplace = True)
+combined_df_all['mut_type'].isna().sum()
+
+# populate gene_id
+combined_df_all['gene_id'].isna().sum()
+#gene_id_word = combined_df_all['gene_id'].value_counts()
+gene_id_word  = 'Rv2043c' # FIXME, should be derived
+combined_df_all['gene_id'].fillna(gene_id_word, inplace = True)
+combined_df_all['gene_id'].isna().sum()
+
+# populate gene_name
+combined_df_all['gene_name'].isna().sum()
+combined_df_all['gene_name'].value_counts()
+combined_df_all['gene_name'].fillna(gene, inplace = True)
+combined_df_all['gene_name'].isna().sum()
+
+
 # FIXME: DIM
 # only with left join!
 outdf_expected_rows = len(combined_df)
@@ -383,11 +414,52 @@ else:
              , '\nmuts in df2 but NOT in df1:'
              , ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum())
     sys.exit()
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-# nan in mutation col
-# FIXME: should get fixmed with JP's resolved dataset!?
-combined_df_all['mutation'].isna().sum()
-baz = combined_df_all[combined_df_all['mutation'].isna()]
+    
+
+#%% IMPORTANT: check if mutation related info is all populated after this merge  
+# FIXME: should get fixed with JP's resolved dataset!?
+check_nan = combined_df_all.isna().sum(axis = 0)
+# relevant mut cols
+check_mut_cols = merging_cols_m5 + merging_cols_m6
+
+count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'})
+
+if (count_na_mut_cols['na_count'].sum() > 0).any():
+    # FIXME: static override, generate 'mutation' from variable
+    na_muts_n = combined_df_all['mutation'].isna().sum() 
+    baz = combined_df_all[combined_df_all['mutation'].isna()]
+    baz = baz[check_mut_cols]
+    print(na_muts_n, 'mutations have missing \'mutation\' info.'
+          , '\nFetching these from reference dict...')
+    
+lookup_dict = dict()
+for k, v in oneletter_aa_dict.items():
+    lookup_dict[k] = v['three_letter_code_lower']
+    print(lookup_dict)
+    wt_3let = combined_df_all['wild_type'].map(lookup_dict).str.capitalize()
+    #print(wt_3let)
+    pos = combined_df_all['position'].astype(str)
+    #print(pos)
+    mt_3let = combined_df_all['mutant_type'].map(lookup_dict).str.capitalize()
+    #print(mt_3let)
+    baz['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
+    print(combined_df_all['mutation'])    
+
+# populate mut_info_f2
+combined_df_all['mut_info_f2'] = combined_df_all['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
+
+#%% merge
+#merging_cols_m7 = detect_common_cols(combined_df_all, baz)
+
+baz2 = baz[['mutationinformation', 'mut_info_f2']]
+baz2 = baz2.drop_duplicates()
+merging_cols_m7 = detect_common_cols(combined_df_all, baz2)
+
+combined_df_all2 = pd.merge(combined_df_all, baz2
+                           #, on = merging_cols_m7
+                            , on = 'mutationinformation'
+                            , how = o_join)
+
 #%%============================================================================
 output_cols = combined_df_all.columns
 print('Output cols:', output_cols)
diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py
index 2f3531a..3f72b1b 100755
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -45,8 +45,6 @@ Created on Tue Aug  6 12:56:03 2019
 #5. chain
 #6. wild_pos
 #7. wild_chain_pos
-
-
 #=======================================================================
 #%% load libraries
 import os, sys
diff --git a/scripts/or_kinship_link.py b/scripts/or_kinship_link.py
index b1a00ce..17451fb 100755
--- a/scripts/or_kinship_link.py
+++ b/scripts/or_kinship_link.py
@@ -104,7 +104,7 @@ or_df.columns
 
 #%% snp_info file: master and gene specific ones
 # gene info
-info_df2  = pd.read_csv(gene_info, sep = '\t', header = 0) #447, 10 
+info_df2  = pd.read_csv(gene_info, sep = '\t', header = 0) #447, 11
 #info_df2  = pd.read_csv(gene_info, sep = ',', header = 0) #447 10   
 mis_mut_cover = (info_df2['chromosome_number'].nunique()/info_df2['chromosome_number'].count()) * 100
 print('*****RESULT*****'
@@ -212,7 +212,7 @@ else:
 
 #PENDING Jody's reply 
 # !!!!!!!! 
-# drop nan from dfm2_mis as these are not useful
+# drop nan from dfm2_mis as these are not useful and JP confirmed the same
 print('Dropping NAs before further processing...')
 dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
 # !!!!!!!!

From 645868ea272e6cbce201390ea7d29f5c16504c8b Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 4 Sep 2020 21:04:18 +0100
Subject: [PATCH 02/27] adding missing mutation col in combining_dfs

---
 scripts/combining_dfs.py | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index 50ff6ee..622e8e5 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -427,42 +427,37 @@ count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().r
 if (count_na_mut_cols['na_count'].sum() > 0).any():
     # FIXME: static override, generate 'mutation' from variable
     na_muts_n = combined_df_all['mutation'].isna().sum() 
-    baz = combined_df_all[combined_df_all['mutation'].isna()]
-    baz = baz[check_mut_cols]
+    #baz = combined_df_all[combined_df_all['mutation'].isna()]
     print(na_muts_n, 'mutations have missing \'mutation\' info.'
           , '\nFetching these from reference dict...')
+else:
+    print('No missing \'mutation\' has been detected!')
+    
     
 lookup_dict = dict()
 for k, v in oneletter_aa_dict.items():
     lookup_dict[k] = v['three_letter_code_lower']
     print(lookup_dict)
-    wt_3let = combined_df_all['wild_type'].map(lookup_dict).str.capitalize()
+    wt_3let = combined_df_all['wild_type'].map(lookup_dict)
     #print(wt_3let)
     pos = combined_df_all['position'].astype(str)
     #print(pos)
-    mt_3let = combined_df_all['mutant_type'].map(lookup_dict).str.capitalize()
+    mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
     #print(mt_3let)
-    baz['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
+    # override the 'mutation' column
+    combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
     print(combined_df_all['mutation'])    
 
 # populate mut_info_f2
 combined_df_all['mut_info_f2'] = combined_df_all['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
 
-#%% merge
-#merging_cols_m7 = detect_common_cols(combined_df_all, baz)
-
-baz2 = baz[['mutationinformation', 'mut_info_f2']]
-baz2 = baz2.drop_duplicates()
-merging_cols_m7 = detect_common_cols(combined_df_all, baz2)
-
-combined_df_all2 = pd.merge(combined_df_all, baz2
-                           #, on = merging_cols_m7
-                            , on = 'mutationinformation'
-                            , how = o_join)
+#%% check
+#cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2']
+#foo = combined_df_all[cols_check]
 
 #%%============================================================================
 output_cols = combined_df_all.columns
-print('Output cols:', output_cols)
+#print('Output cols:', output_cols)
 
 #%%============================================================================ 
 # write csv

From dd1158a66c5c92f560b6cb05a3f2f8e0d1888dc3 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 4 Sep 2020 22:40:49 +0100
Subject: [PATCH 03/27] script to plot lineage dist plots

---
 scripts/plotting/lineage_dist_PS.R | 232 +++++++++++++++++++++++++++++
 1 file changed, 232 insertions(+)
 create mode 100644 scripts/plotting/lineage_dist_PS.R

diff --git a/scripts/plotting/lineage_dist_PS.R b/scripts/plotting/lineage_dist_PS.R
new file mode 100644
index 0000000..432af84
--- /dev/null
+++ b/scripts/plotting/lineage_dist_PS.R
@@ -0,0 +1,232 @@
+#!/usr/bin/env Rscript       
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+########################################################################
+# 				Installing and loading required packages 			   #
+########################################################################
+
+source("../Header_TT.R")
+#source("../barplot_colour_function.R")
+#require(data.table)
+
+########################################################################
+#		 Read file: call script for combining df for PS			   	   #
+########################################################################
+
+source("combining_two_df.R")
+
+#---------------------- PAY ATTENTION
+# the above changes the working dir
+#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
+#---------------------- PAY ATTENTION
+
+#==========================
+# This will return:
+
+# df with NA for <drug>
+# merged_df2
+# merged_df3
+
+# df without NA for <drug>
+# merged_df2_comp
+# merged_df3_comp
+#===========================
+
+###########################
+# Data for plots
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available, hence use df with NA
+###########################
+
+# uncomment as necessary
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+my_df  = merged_df2
+#my_df  = merged_df2_comp
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+# delete variables not required
+rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+table(my_df$mutation_info); str(my_df$mutation_info)
+
+# subset df with dr muts only
+my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 
+table(my_df_dr$mutation_info)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#==========================
+# Run two times: 
+# uncomment as necessary
+# 1) for all muts
+# 2) for dr_muts
+#===========================
+
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+
+#================
+# for ALL muts
+#================
+plot_df = my_df  
+my_plot_name = 'lineage_dist_PS.svg'
+
+plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
+
+#my_plot_name = 'lineage_dist_PS_comp.svg'
+
+#================
+# for dr muts ONLY
+#================
+plot_df = my_df_dr 
+#my_plot_name = 'lineage_dist_dr_PS.svg'
+#my_plot_name = 'lineage_dist_dr_PS_comp.svg'
+my_plot_name = 'lineage_dist_drug_muts_PS.svg'
+
+plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
+
+#%%%%%%%%%%%%%%%%%%%%%%%%
+
+#==========================
+# Plot: Lineage Distribution
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+#===================
+# Data for plots
+#===================
+table(plot_df$lineage); str(plot_df$lineage)
+
+# subset only lineages1-4
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+                 #, "lineage5"
+                 #, "lineage6"
+                 #, "lineage7")
+
+# uncomment as necessary
+df_lin = subset(plot_df, subset = lineage %in% sel_lineages )
+table(df_lin$lineage)
+
+# refactor
+df_lin$lineage = factor(df_lin$lineage)
+
+sum(table(df_lin$lineage)) #{RESULT: Total number of samples for lineage}
+
+table(df_lin$lineage)#{RESULT: No of samples within lineage}
+
+length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
+
+length(df_lin$mutationinformation)
+# sanity checks
+# FIXME
+r1 = 2:7 # when merged_df2 used: because there is missing lineages 
+if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) {
+  print ("sanity check passed: numbers match")
+} else{
+  print("Error!: check your numbers")
+} 
+
+u2 = unique(plot_df$mutationinformation)
+u = unique(df_lin$mutationinformation)
+check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+df <- df_lin
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+
+rm(df_lin)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# 2 : ggridges (good!)
+my_ats = 15 # axis text size
+my_als = 20 # axis label size
+
+my_labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'
+              #, 'Lineage 5', 'Lineage 6', 'Lineage 7'
+              )
+names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4'
+                    # , 'lineage5', 'lineage6', 'lineage7'
+                     )
+# check plot name
+my_plot_name
+
+# output svg
+svg(plot_lineage_duet)
+printFile = ggplot(df, aes(x = duet_scaled
+                            , y = duet_outcome))+
+  
+  #printFile=geom_density_ridges_gradient(
+  geom_density_ridges_gradient(aes(fill = ..x..)
+                                , scale = 3
+                                , size = 0.3 ) +
+  facet_wrap( ~lineage
+              , scales = "free"
+#             , switch = 'x'
+              , labeller = labeller(lineage = my_labels) ) +
+  coord_cartesian( xlim = c(-1, 1)
+#                   , ylim = c(0, 6)
+#                   , clip = "off" 
+) +
+  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
+                        , name = "DUET" ) + 
+  theme(axis.text.x = element_text(size = my_ats
+                                     , angle = 90
+                                     , hjust = 1
+                                     , vjust = 0.4)
+#                  , axis.text.y = element_text(size = my_ats
+#                                                , angle = 0
+#                                                , hjust = 1
+#                                                , vjust = 0)
+         , axis.text.y = element_blank()
+         , axis.title.x = element_blank()
+         , axis.title.y = element_blank()
+         , axis.ticks.y = element_blank()
+         , plot.title = element_blank()
+         , strip.text = element_text(size = my_als)
+         , legend.text = element_text(size = 10)
+         , legend.title = element_text(size = my_als)
+#                  , legend.position = c(0.3, 0.8)
+#                  , legend.key.height = unit(1, 'mm')
+  ) 
+
+print(printFile)
+dev.off()
+
+#=!=!=!=!=!=!=!
+# COMMENT: Not much differences in the distributions
+# when using merged_df2 or merged_df2_comp.
+# Also, the lineage differences disappear when looking at all muts
+# The pattern we are interested in is possibly only for dr_mutations
+#=!=!=!=!=!=!=!
+#===================================================
+
+# COMPARING DISTRIBUTIONS: KS test 
+# run: "../KS_test_PS.R"
+
+
+

From 7460c7c97fb908e432b1ea4f022e03f3764309f4 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 4 Sep 2020 22:43:30 +0100
Subject: [PATCH 04/27] updated combining_two_df.R for plots

---
 scripts/plotting/combining_two_df.R | 421 ++++++++++++++++++++++++++++
 1 file changed, 421 insertions(+)
 create mode 100644 scripts/plotting/combining_two_df.R

diff --git a/scripts/plotting/combining_two_df.R b/scripts/plotting/combining_two_df.R
new file mode 100644
index 0000000..7b09f03
--- /dev/null
+++ b/scripts/plotting/combining_two_df.R
@@ -0,0 +1,421 @@
+#!/usr/bin/env Rscript       
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+#########################################################
+# TASK: To combine struct params and meta data for plotting
+# Input csv files:
+# 1) <gene>_all_params.csv
+# 2) <gene>_meta_data.csv
+
+# Output: 
+# 1) muts with opposite effects on stability
+# 2) large combined df including NAs for AF, OR,etc
+# 		Dim: same no. of rows as gene associated meta_data_with_AFandOR
+# 3) small combined df including NAs for AF, OR, etc.
+# 		Dim: same as mcsm data
+# 4) large combined df excluding NAs 
+# 		Dim: dim(#1) - no. of NAs(AF|OR) + 1
+# 5) small combined df excluding NAs
+# 		Dim: dim(#2) - no. of unique NAs - 1
+# This script is sourced from other .R scripts for plotting
+#########################################################
+
+##########################################################
+# 				Installing and loading required packages 			 
+##########################################################
+#source("Header_TT.R")
+require(data.table)
+require(arsenal)
+require(compare)
+library(tidyverse)
+
+source("plotting_data.R")
+
+# should return the following dfs, directories and variables
+# my_df
+# my_df_u
+# my_df_u_lig
+# dup_muts
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nLength of upos:", length(upos)
+           , "\nAngstrom symbol:", angstroms_symbol))       
+
+# clear excess variable
+rm(my_df, upos, dup_muts, my_df_u_lig)
+#========================================================
+
+#========================================================
+#%% variable assignment: input and output paths & filenames
+drug = "pyrazinamide"
+gene = "pncA"
+gene_match = paste0(gene,"_p.")
+cat(gene_match)
+
+#=============
+# directories
+#=============
+datadir = paste0("~/git/Data")
+indir = paste0(datadir, "/", drug, "/input")
+outdir = paste0("~/git/Data", "/", drug, "/output")
+plotdir = paste0("~/git/Data", "/", drug, "/output/plots")
+#===========
+# input
+#===========
+#in_file1: output of plotting_data.R
+
+
+# infile 2: gene associated meta data
+#in_filename_gene_metadata = paste0(tolower(gene),  "_meta_data_with_AFandOR.csv")
+in_filename_gene_metadata = paste0(tolower(gene),  "_metadata.csv")
+infile_gene_metadata = paste0(outdir, "/", in_filename_gene_metadata)
+cat(paste0("Input infile 2:", infile_gene_metadata))
+
+#===========
+# output
+#===========
+# mutations with opposite effects
+out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv")
+outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts)
+
+
+#%%===============================================================
+table(my_df_u$duet_outcome); sum(table(my_df_u$duet_outcome) )
+
+# spelling Correction 1: DUET incase American spelling needed!
+#my_df_u$duet_outcome[my_df_u$duet_outcome=="Stabilising"] <- "Stabilizing"
+#my_df_u$duet_outcome[my_df_u$duet_outcome=="Destabilising"] <- "Destabilizing"
+
+
+# spelling Correction 2: Ligand incase American spelling needed!
+table(my_df_u$ligand_outcome); sum(table(my_df_u$ligand_outcome) )
+#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Stabilising"] <- "Stabilizing"
+#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Destabilising"] <- "Destabilizing"
+
+
+# muts with opposing effects on protomer and ligand stability
+table(my_df_u$duet_outcome != my_df_u$ligand_outcome)
+changes = my_df_u[which(my_df_u$duet_outcome != my_df_u$ligand_outcome),]
+
+# sanity check: redundant, but uber cautious!
+dl_i = which(my_df_u$duet_outcome != my_df_u$ligand_outcome)
+ld_i = which(my_df_u$ligand_outcome != my_df_u$duet_outcome)
+
+cat("Identifying muts with opposite stability effects")
+if(nrow(changes) == (table(my_df_u$duet_outcome != my_df_u$ligand_outcome)[[2]]) & identical(dl_i,ld_i)) {
+  cat("PASS: muts with opposite effects on stability and affinity correctly identified"
+        , "\nNo. of such muts: ", nrow(changes))
+}else {
+  cat("FAIL: unsuccessful in extracting muts with changed stability effects")
+}
+
+#***************************
+# write file: changed muts
+write.csv(changes, outfile_opp_muts)
+
+cat("Finished writing file for muts with opp effects:"
+    , "\nFilename: ", outfile_opp_muts
+    , "\nDim:", dim(changes))
+
+# clear variables
+rm(out_filename_opp_muts, outfile_opp_muts)
+rm(changes, dl_i, ld_i)
+
+# count na in each column
+na_count = sapply(my_df_u, function(y) sum(length(which(is.na(y))))); na_count
+df_ncols = ncol(my_df_u)
+
+###########################
+# 2: Read file: <gene>_meta data.csv
+###########################
+cat("Reading meta data file:", infile_gene_metadata)
+
+gene_metadata <- read.csv(infile_gene_metadata
+                      , stringsAsFactors = F
+                      , header = T)
+cat("Dim:", dim(gene_metadata))
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# FIXME: remove
+# counting NAs in AF, OR cols: 
+if (identical(sum(is.na(my_df_u$or_mychisq))
+              , sum(is.na(my_df_u$pval_fisher))
+              , sum(is.na(my_df_u$af)))){
+  cat("\nPASS: NA count match for OR, pvalue and AF\n")
+  na_count = sum(is.na(my_df_u$af))
+  cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_mychisq)))
+} else{
+  cat("\nFAIL: NA count mismatch"
+      , "\nNA in OR: ", sum(is.na(my_df_u$or_mychisq))
+      , "\nNA in pvalue: ", sum(is.na(my_df_u$pval_fisher))
+      , "\nNA in AF:", sum(is.na(my_df_u$af)))
+}
+
+
+if (identical(sum(is.na(my_df_u$or_kin))
+              , sum(is.na(my_df_u$pwald_kin))
+              , sum(is.na(my_df_u$af_kin)))){
+  cat("\nPASS: NA count match for OR, pvalue and AF\n from Kinship matrix calculations")
+  na_count = sum(is.na(my_df_u$af_kin))
+  cat("\nNo. of NAs: ", sum(is.na(my_df_u$or_kin)))
+} else{
+  cat("\nFAIL: NA count mismatch"
+      , "\nNA in OR: ", sum(is.na(my_df_u$or_kin))
+      , "\nNA in pvalue: ", sum(is.na(my_df_u$pwald_kin))
+      , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
+}
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# clear variables
+rm(in_filename_gene_metadata, infile_gene_metadata)
+
+str(gene_metadata)
+
+# sort by position (same as my_df)
+# earlier it was mutationinformation
+#head(gene_metadata$mutationinformation)
+#gene_metadata = gene_metadata[order(gene_metadata$mutationinformation),]
+##head(gene_metadata$mutationinformation)
+
+head(gene_metadata$position)
+gene_metadata = gene_metadata[order(gene_metadata$position),]
+head(gene_metadata$position)
+
+###########################
+# Merge 1: two dfs with NA
+# merged_df2
+###########################
+head(my_df_u$mutationinformation)
+head(gene_metadata$mutationinformation)
+
+# Find common columns b/w two df
+# FIXME: mutation has empty cell for some muts
+merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata))
+
+cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)"
+           , "\nNo. of merging cols:", length(merging_cols)
+           , "\nMerging columns identified:"))
+print(merging_cols)
+
+# important checks!
+table(nchar(my_df_u$mutationinformation))
+table(nchar(my_df_u$wild_type))
+table(nchar(my_df_u$mutant_type))
+table(nchar(my_df_u$position))
+
+#=============
+# merged_df2: gene_metadata + my_df
+#==============
+# all.y  because x might contain non-structural positions!
+merged_df2 = merge(x = gene_metadata
+                  , y = my_df_u
+                  , by = merging_cols
+                  , all.y = T)
+
+cat("Dim of merged_df2: ", dim(merged_df2))
+head(merged_df2$position)
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# FIXME: count how many unique muts have entries in meta data
+# should PASS 
+cat("Checking nrows in merged_df2")
+if(nrow(gene_metadata) == nrow(merged_df2)){
+  cat("PASS: nrow(merged_df2) = nrow (gene associated gene_metadata)"
+      ,"\nExpected no. of rows: ",nrow(gene_metadata) 
+      ,"\nGot no. of rows: ", nrow(merged_df2))
+} else{
+  cat("FAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
+      , "\nExpected no. of rows after merge: ", nrow(gene_metadata)
+      , "\nGot no. of rows: ", nrow(merged_df2)
+      , "\nFinding discrepancy")
+  merged_muts_u = unique(merged_df2$mutationinformation)
+  meta_muts_u = unique(gene_metadata$mutationinformation)
+    # find the index where it differs
+  unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
+}
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# sort by position
+head(merged_df2$position)
+merged_df2 = merged_df2[order(merged_df2$position),]
+head(merged_df2$position)
+
+merged_df2v3 = merge(x = gene_metadata
+                   , y = my_df_u
+                   , by = merging_cols
+                   , all = T) 
+
+merged_df2v2 = merge(x = gene_metadata
+                     , y = my_df_u
+                     , by = merging_cols
+                     , all.x = T) 
+#!=!=!=!=!=!=!=!
+#identical(merged_df2, merged_df2v2)
+
+nrow(merged_df2[merged_df2$position==186,])
+#!=!=!=!=!=!=!=!
+
+# should  be False
+identical(merged_df2, merged_df2v2)
+table(merged_df2$position%in%merged_df2v2$position)
+
+#!!!!!!!!!!! check why these differ
+
+#########
+# merge 3b (merged_df3):remove duplicated mutations
+cat("Merging dfs without NAs: small df (removing muts with no AF|OR associated)"
+    ,"\nCannot trust lineage info from this"
+    ,"\nlinking col: mutationinforamtion"
+    ,"\nfilename: merged_df3")
+
+#==#=#=#=#=#=#
+# Cannot trust lineage, country from this df as the same mutation
+# can have many different lineages
+# but this should be good for the numerical corr plots
+#=#=#=#=#=#=#=
+merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] 
+head(merged_df3$position); tail(merged_df3$position) # should be sorted
+
+# sanity check
+cat("Checking nrows in merged_df3")
+if(nrow(my_df_u) == nrow(merged_df3)){
+  cat("PASS: No. of rows match with my_df"
+      ,"\nExpected no. of rows: ", nrow(my_df_u)
+      ,"\nGot no. of rows: ", nrow(merged_df3))
+} else {
+  cat("FAIL: No. of rows mismatch"
+      , "\nNo. of rows my_df: ", nrow(my_df_u)
+      , "\nNo. of rows merged_df3: ", nrow(merged_df3))
+  quit()
+}
+
+# counting NAs in AF, OR cols in merged_df3
+# this is because mcsm has no AF, OR cols,
+# so you cannot count NAs
+if (identical(sum(is.na(merged_df3$or_kin))
+              , sum(is.na(merged_df3$pwald_kin))
+              , sum(is.na(merged_df3$af_kin)))){
+  cat("PASS: NA count match for OR, pvalue and AF\n")
+  na_count_df3 = sum(is.na(merged_df3$af_kin))
+  cat("No. of NAs: ", sum(is.na(merged_df3$or_kin)))
+} else{
+  cat("FAIL: NA count mismatch"
+      , "\nNA in OR: ", sum(is.na(merged_df3$or_kin))
+      , "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin))
+      , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
+}
+
+# check if the same or and afs are missing for 
+if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_kin)))
+  && identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin)))
+  && identical( which(is.na(merged_df2$pval_fisher)), which(is.na(merged_df2$pwald_kin))) ){
+  cat('PASS: Indices match for mychisq and kin ors missing values')
+} else{
+  cat('Index mismatch: mychisq and kin ors missing indices match')
+  quit()
+}
+
+###########################
+# 4: merging two dfs: without NA
+###########################
+#########
+# merge 4a (merged_df2_comp): same as merge 1 but excluding NA
+#########
+cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"
+    ,"\nlinking col: Mutationinforamtion"
+    ,"\nfilename: merged_df2_comp")
+
+if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
+  print('mychisq and kin ors missing indices match. Procedding with omitting NAs')
+  na_count_df2 = sum(is.na(merged_df2$af))
+  merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
+  # sanity check: no +-1 gymnastics
+  cat("Checking nrows in merged_df2_comp")
+  if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
+    cat("\nPASS: No. of rows match"
+        ,"\nDim of merged_df2_comp: "
+        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
+        , "\nNo. of rows: ", nrow(merged_df2_comp)
+        , "\nNo. of cols: ", ncol(merged_df2_comp))
+  }else{
+    cat("FAIL: No. of rows mismatch"
+        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
+        ,"\nGot no. of rows: ", nrow(merged_df2_comp))
+  }
+}else{
+  print('Index mismatch for mychisq and kin ors. Aborting NA ommission')
+}
+
+#########
+# merge 4b (merged_df3_comp): remove duplicate mutation information
+#########
+if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){
+  print('mychisq and kin ors missing indices match. Procedding with omitting NAs')
+  na_count_df3 = sum(is.na(merged_df3$af))
+  #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
+  merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
+  cat("Checking nrows in merged_df3_comp")
+  if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
+    cat("\nPASS: No. of rows match"
+        ,"\nDim of merged_df3_comp: "
+        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
+        , "\nNo. of rows: ", nrow(merged_df3_comp)
+        , "\nNo. of cols: ", ncol(merged_df3_comp))
+    }else{
+    cat("FAIL: No. of rows mismatch"
+        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
+        ,"\nGot no. of rows: ", nrow(merged_df3_comp))
+   }
+} else{
+  print('Index mismatch for mychisq and kin ors. Aborting NA ommission')
+} 
+  
+# alternate way of deriving merged_df3_comp
+foo = merged_df3[!is.na(merged_df3$af),]
+bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]
+# compare dfs: foo and merged_df3_com
+all.equal(foo, bar)
+#summary(comparedf(foo, bar))
+
+#=============== end of combining df
+#==============================================================
+#################
+# OPTIONAL: write ALL 4 output files 
+#################
+#outvars = c("merged_df2"
+#             , "merged_df3"
+#             , "merged_df2_comp"
+#             , "merged_df3_comp")
+
+#cat("Writing output files: "
+#    , "\nPath:", outdir)
+
+#for (i in outvars){
+#  out_filename = paste0(i, ".csv")
+#  outfile = paste0(outdir, "/", out_filename)
+#  cat("Writing output file:"
+#      ,"\nFilename: ", out_filename,"\n")
+#  write.csv(get(i), outfile, row.names = FALSE)
+#  cat("Finished writing: ", outfile
+#      , "\nNo. of rows: ", nrow(get(i))
+#      , "\nNo. of cols: ", ncol(get(i)), "\n")
+#}
+
+#*************************
+# clear variables
+rm(foo, bar, gene_metadata
+   , merged_df2v2, merged_df2v3)
+
+#============================= end of script
+

From db87f98d32a49886b77e48ec57136bfbbe0c61d6 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Fri, 4 Sep 2020 22:46:07 +0100
Subject: [PATCH 05/27] updated giignore

---
 .gitignore | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.gitignore b/.gitignore
index 6e8243e..426161a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,11 @@
 *.pyc
 __pycache__
 */__pycache__
+mcsm_analysis_fixme
+plotting_test
+scratch
+scripts_old
+foldx/test/
+examples
+del
+

From 2ef767f046584c4937df89548882d0a089a5f2cb Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 7 Sep 2020 11:29:28 +0100
Subject: [PATCH 06/27] lineage barplot script

---
 scripts/plotting/combining_two_df.R      |   1 +
 scripts/plotting/lineage_basic_barplot.R | 218 +++++++++++++++++++++++
 2 files changed, 219 insertions(+)
 create mode 100644 scripts/plotting/lineage_basic_barplot.R

diff --git a/scripts/plotting/combining_two_df.R b/scripts/plotting/combining_two_df.R
index 7b09f03..5df2525 100644
--- a/scripts/plotting/combining_two_df.R
+++ b/scripts/plotting/combining_two_df.R
@@ -415,6 +415,7 @@ all.equal(foo, bar)
 #*************************
 # clear variables
 rm(foo, bar, gene_metadata
+   , in_filename_params, infile_params, merging_cols
    , merged_df2v2, merged_df2v3)
 
 #============================= end of script
diff --git a/scripts/plotting/lineage_basic_barplot.R b/scripts/plotting/lineage_basic_barplot.R
new file mode 100644
index 0000000..1c91483
--- /dev/null
+++ b/scripts/plotting/lineage_basic_barplot.R
@@ -0,0 +1,218 @@
+#!/usr/bin/env Rscript       
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+#########################################################
+# TASK: Basic lineage barplot showing numbers
+
+# Output:
+
+##########################################################
+# 				Installing and loading required packages 			 
+##########################################################
+source("Header_TT.R")
+require(data.table)
+source("combining_two_df.R")
+
+#==========================
+# should return the following dfs, directories and variables
+
+# df with NA:
+# merged_df2
+# merged_df3
+
+# df without NA:
+# merged_df2_comp
+# merged_df3_comp
+
+# my_df_u
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nAngstrom symbol:", angstroms_symbol
+           , "\nNo. of cols:", df_ncols
+           , "\nNo. of duplicated muts:", dup_muts_nu
+           , "\nNA count for ORs:", na_count
+           , "\nNA count in df2:", na_count_df2
+           , "\nNA count in df3:", na_count_df3))       
+
+#=========================
+#=======
+# output
+#=======
+# plot 1
+basic_bp_lineage = "basic_lineage_barplot.svg"
+plot_basic_bp_lineage  =  paste0(plotdir,"/", basic_bp_lineage)
+
+#=======================================================================
+#================
+# Data for plots:
+# you need merged_df2, comprehensive one
+# since this has one-many relationship
+# i.e the same SNP can belong to multiple lineages
+#================
+# REASSIGNMENT as necessary
+my_df  = merged_df2
+#my_df  = merged_df2_comp
+
+# clear excess variable
+rm(merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+#==========================
+# Plot: Lineage barplot
+# x = lineage y = No. of samples
+# col = Lineage
+# fill = lineage
+#============================
+table(my_df$lineage)
+
+#****************
+# Plot: Lineage Barplot
+#****************
+
+#=============
+# Data for plots
+#=============
+
+# REASSIGNMENT
+df <- my_df
+
+rm(my_df)
+
+# get freq count of positions so you can subset freq<1
+#setDT(df)[, lineage_count := .N, by = .(lineage)]
+
+#******************
+# generate plot: barplot of mutation by lineage
+#******************
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4"
+                 #, "lineage5"
+                 #, "lineage6"
+                 #, "lineage7"
+                 )
+
+df_lin = subset(df, subset = lineage %in% sel_lineages )
+
+#FIXME; add sanity check for numbers.
+# Done this manually
+
+############################################################
+
+#########
+# Data for barplot: Lineage barplot
+# to show total samples and number of unique mutations 
+# within each linege
+##########
+
+# Create df with lineage inform & no. of unique mutations
+# per lineage and total samples within lineage
+# this is essentially barplot with two y axis
+
+bar = bar = as.data.frame(sel_lineages) #4, 1
+total_snps_u = NULL
+total_samples = NULL
+
+for (i in sel_lineages){
+  #print(i)
+  curr_total = length(unique(df$id)[df$lineage==i])
+  total_samples = c(total_samples, curr_total)
+  print(total_samples)
+  
+  foo = df[df$lineage==i,]
+  print(paste0(i, "======="))
+  print(length(unique(foo$mutationinformation)))
+  curr_count = length(unique(foo$mutationinformation))
+
+  total_snps_u = c(total_snps_u, curr_count)
+}
+
+print(total_snps_u)
+bar$num_snps_u = total_snps_u
+bar$total_samples = total_samples
+bar
+
+#*****************
+# generate plot: lineage barplot with two y-axis
+#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
+#*****************
+
+y1 = bar$num_snps_u
+y2 = bar$total_samples
+x = sel_lineages
+
+to_plot = data.frame(x = x
+                      , y1 = y1
+                      , y2 = y2)
+to_plot
+
+# FIXME later: will be depricated!
+melted = melt(to_plot, id = "x")
+melted
+
+
+svg(plot_basic_bp_lineage)
+
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+g = ggplot(melted, aes(x = x
+                 , y = value
+                 , fill = variable))
+
+printFile = g + geom_bar(stat = "identity"
+                         , position = position_stack(reverse = TRUE)
+                         , alpha=.75
+                         , colour='grey75') + 
+  theme(axis.text.x = element_text(size = my_ats)
+        , axis.text.y = element_text(size = my_ats
+                                     #, angle = 30
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_als
+                                      , colour = 'black')
+        , axis.title.y = element_text(size = my_als
+                                      , colour = 'black')
+        , legend.position = "top"
+        , legend.text = element_text(size = my_als) + 
+          #geom_text() +
+          geom_label(aes(label = value)
+                     , size = 5
+                     , hjust = 0.5
+                     , vjust = 0.5
+                     , colour = 'black'
+                     , show.legend = FALSE
+                     #, check_overlap = TRUE
+                     , position = position_stack(reverse = T)) + 
+          labs(title = ''
+               , x = ''
+               , y = "Number"
+               , fill = 'Variable'
+               , colour = 'black') + 
+          scale_fill_manual(values = c('grey50', 'gray75')
+                            , name=''
+                            , labels=c('Mutations', 'Total Samples')) +
+          scale_x_discrete(breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
+                           , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')))
+
+print(printFile)
+dev.off()

From b4affa0c94792f41c1dd1085fb9b70839ada6dba Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 7 Sep 2020 14:05:46 +0100
Subject: [PATCH 07/27] Combining dfs for PS and lig in one

---
 scripts/plotting/combined_or.R                | 193 ++++++++
 ...ning_two_df.R => combining_dfs_plotting.R} | 229 ++++-----
 scripts/plotting/combining_two_df_FIXME.R     | 442 ------------------
 scripts/plotting/lineage_basic_barplot.R      |  55 +--
 scripts/plotting/lineage_count.txt            |  71 +++
 scripts/plotting/opp_mcsm_muts.R              |  95 ++++
 6 files changed, 464 insertions(+), 621 deletions(-)
 create mode 100644 scripts/plotting/combined_or.R
 rename scripts/plotting/{combining_two_df.R => combining_dfs_plotting.R} (68%)
 delete mode 100755 scripts/plotting/combining_two_df_FIXME.R
 create mode 100644 scripts/plotting/lineage_count.txt
 create mode 100644 scripts/plotting/opp_mcsm_muts.R

diff --git a/scripts/plotting/combined_or.R b/scripts/plotting/combined_or.R
new file mode 100644
index 0000000..4bf2b80
--- /dev/null
+++ b/scripts/plotting/combined_or.R
@@ -0,0 +1,193 @@
+
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: Basic lineage barplot showing numbers
+
+# Output: Basic barplot with lineage samples and mut count
+
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+
+source("Header_TT.R")
+require(cowplot)
+source("combining_dfs_plotting.R")
+# should return the following dfs, directories and variables
+
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nAngstrom symbol:", angstroms_symbol
+           , "\nNo. of duplicated muts:", dup_muts_nu
+           , "\nNA count for ORs:", na_count
+           , "\nNA count in df2:", na_count_df2
+           , "\nNA count in df3:", na_count_df3))       
+
+#=========================
+#=======
+# output
+#=======
+or_combined = "or_combined_PS_LIG.svg"
+plot_or_combined  =  paste0(plotdir,"/", or_combined)
+
+or_kin_combined = "or_kin_combined_PS_LIG.svg"
+plot_or_kin_combined  =  paste0(plotdir,"/", or_kin_combined)
+#=======================================================================
+
+###########################
+# Data for OR and stability plots
+# you need merged_df3_comp
+# since these are matched 
+# to allow pairwise corr
+###########################
+
+ps_df  = merged_df3_comp
+lig_df = merged_df3_comp_lig
+
+# Ensure correct data type in columns to plot: should be TRUE
+is.numeric(ps_df$or_mychisq)
+is.numeric(lig_df$or_mychisq)
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig)
+#%% end of section 1
+
+# sanity check: should be <10
+if (max(lig_df$ligand_distance) < 10){
+  print ("Sanity check passed: lig data is <10Ang")
+}else{
+  print ("Error: data should be filtered to be within 10Ang")
+}
+
+#############
+# Plots: Bubble plot
+# x = Position, Y = stability
+# size of dots = OR
+# col: stability
+#############
+
+#-----------------
+# Plot 1: DUET vs OR by position as geom_points
+#-------------------
+
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+# Spelling Correction: made redundant as already corrected at the source
+#ps_df$duet_outcome[ps_df$duet_outcome=='Stabilizing'] <- 'Stabilising'
+#ps_df$duet_outcome[ps_df$duet_outcome=='Destabilizing'] <- 'Destabilising'
+
+table(ps_df$duet_outcome) ; sum(table(ps_df$duet_outcome))
+
+g1 = ggplot(ps_df, aes(x = factor(position)
+                   , y = duet_scaled))
+
+p1 = g1 + 
+  geom_point(aes(col = duet_outcome
+                 #, size = or_mychisq))+
+                 , size = or_kin)) +
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_als)
+        , axis.title.y = element_text(size = my_als) 
+        , legend.text = element_text(size = my_als)
+        , legend.title = element_text(size = my_als) ) +
+  #, legend.key.size = unit(1, "cm")) +
+  labs(title = ""
+       , x = "Position"
+       , y = "DUET(PS)"
+       , size = "Odds Ratio"
+       , colour = "DUET Outcome") +
+  guides(colour = guide_legend(override.aes = list(size=4))) 
+
+p1 
+
+#-------------------
+# generate plot 2: Lig vs OR by position as geom_points
+#-------------------
+
+# Spelling Correction: made redundant as already corrected at the source
+
+#lig_df$ligand_outcome[lig_df$ligand_outcome=='Stabilizing'] <- 'Stabilising'
+#lig_df$ligand_outcome[lig_df$ligand_outcome=='Destabilizing'] <- 'Destabilising'
+
+table(lig_df$ligand_outcome)
+
+g2 = ggplot(lig_df, aes(x = factor(position)
+                   , y = affinity_scaled))
+
+p2 = g2 + 
+  geom_point(aes(col = ligand_outcome
+                 #, size = or_mychisq))+
+                 , size = or_kin)) +
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_text(size = my_ats
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_als)
+        , axis.title.y = element_text(size = my_als) 
+        , legend.text = element_text(size = my_als)
+        , legend.title = element_text(size = my_als) ) +
+  #, legend.key.size = unit(1, "cm")) +
+  labs(title = ""
+       , x = "Position"
+       , y = "Ligand Affinity"
+       , size = "Odds Ratio"
+       , colour = "Ligand Outcome"
+       ) +
+  guides(colour = guide_legend(override.aes = list(size=4))) 
+
+p2
+
+#======================
+# combine using cowplot
+#======================
+
+svg(plot_or_combined, width = 32, height = 12)
+svg(plot_or_kin_combined, width = 32, height = 12)
+
+theme_set(theme_gray()) # to preserve default theme
+
+printFile = cowplot::plot_grid(plot_grid(p1, p2
+                             , ncol = 1
+                             , align = 'v'
+                             , labels = c("", "")
+                             , label_size = my_als+5))
+print(printFile)
+dev.off()
+
diff --git a/scripts/plotting/combining_two_df.R b/scripts/plotting/combining_dfs_plotting.R
similarity index 68%
rename from scripts/plotting/combining_two_df.R
rename to scripts/plotting/combining_dfs_plotting.R
index 5df2525..7d45d1b 100644
--- a/scripts/plotting/combining_two_df.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@@ -1,8 +1,4 @@
-#!/usr/bin/env Rscript       
-getwd()
-setwd("~/git/LSHTM_analysis/scripts/plotting/")
-getwd()
-
+#!/usr/bin/env Rscript
 #########################################################
 # TASK: To combine struct params and meta data for plotting
 # Input csv files:
@@ -16,21 +12,22 @@ getwd()
 # 3) small combined df including NAs for AF, OR, etc.
 # 		Dim: same as mcsm data
 # 4) large combined df excluding NAs 
-# 		Dim: dim(#1) - no. of NAs(AF|OR) + 1
+# 		Dim: dim(#1) - na_count_df2
 # 5) small combined df excluding NAs
-# 		Dim: dim(#2) - no. of unique NAs - 1
+# 		Dim: dim(#2) - na_count_df3
 # This script is sourced from other .R scripts for plotting
 #########################################################
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
 
-##########################################################
-# 				Installing and loading required packages 			 
-##########################################################
-#source("Header_TT.R")
-require(data.table)
-require(arsenal)
-require(compare)
-library(tidyverse)
-
+source("Header_TT.R")
+#require(data.table)
+#require(arsenal)
+#require(compare)
+#library(tidyverse)
 source("plotting_data.R")
 
 # should return the following dfs, directories and variables
@@ -53,28 +50,13 @@ cat(paste0("Variables imported:"
            , "\nAngstrom symbol:", angstroms_symbol))       
 
 # clear excess variable
-rm(my_df, upos, dup_muts, my_df_u_lig)
+rm(my_df, upos, dup_muts)
 #========================================================
-
-#========================================================
-#%% variable assignment: input and output paths & filenames
-drug = "pyrazinamide"
-gene = "pncA"
-gene_match = paste0(gene,"_p.")
-cat(gene_match)
-
-#=============
-# directories
-#=============
-datadir = paste0("~/git/Data")
-indir = paste0(datadir, "/", drug, "/input")
-outdir = paste0("~/git/Data", "/", drug, "/output")
-plotdir = paste0("~/git/Data", "/", drug, "/output/plots")
 #===========
 # input
 #===========
 #in_file1: output of plotting_data.R
-
+# my_df_u
 
 # infile 2: gene associated meta data
 #in_filename_gene_metadata = paste0(tolower(gene),  "_meta_data_with_AFandOR.csv")
@@ -85,56 +67,22 @@ cat(paste0("Input infile 2:", infile_gene_metadata))
 #===========
 # output
 #===========
-# mutations with opposite effects
-out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv")
-outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts)
+# other variables that you can write
+# primarily called by other scripts for plotting
 
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
 
 #%%===============================================================
-table(my_df_u$duet_outcome); sum(table(my_df_u$duet_outcome) )
-
-# spelling Correction 1: DUET incase American spelling needed!
-#my_df_u$duet_outcome[my_df_u$duet_outcome=="Stabilising"] <- "Stabilizing"
-#my_df_u$duet_outcome[my_df_u$duet_outcome=="Destabilising"] <- "Destabilizing"
-
-
-# spelling Correction 2: Ligand incase American spelling needed!
-table(my_df_u$ligand_outcome); sum(table(my_df_u$ligand_outcome) )
-#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Stabilising"] <- "Stabilizing"
-#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Destabilising"] <- "Destabilizing"
-
-
-# muts with opposing effects on protomer and ligand stability
-table(my_df_u$duet_outcome != my_df_u$ligand_outcome)
-changes = my_df_u[which(my_df_u$duet_outcome != my_df_u$ligand_outcome),]
-
-# sanity check: redundant, but uber cautious!
-dl_i = which(my_df_u$duet_outcome != my_df_u$ligand_outcome)
-ld_i = which(my_df_u$ligand_outcome != my_df_u$duet_outcome)
-
-cat("Identifying muts with opposite stability effects")
-if(nrow(changes) == (table(my_df_u$duet_outcome != my_df_u$ligand_outcome)[[2]]) & identical(dl_i,ld_i)) {
-  cat("PASS: muts with opposite effects on stability and affinity correctly identified"
-        , "\nNo. of such muts: ", nrow(changes))
-}else {
-  cat("FAIL: unsuccessful in extracting muts with changed stability effects")
-}
-
-#***************************
-# write file: changed muts
-write.csv(changes, outfile_opp_muts)
-
-cat("Finished writing file for muts with opp effects:"
-    , "\nFilename: ", outfile_opp_muts
-    , "\nDim:", dim(changes))
-
-# clear variables
-rm(out_filename_opp_muts, outfile_opp_muts)
-rm(changes, dl_i, ld_i)
-
-# count na in each column
-na_count = sapply(my_df_u, function(y) sum(length(which(is.na(y))))); na_count
-df_ncols = ncol(my_df_u)
 
 ###########################
 # 2: Read file: <gene>_meta data.csv
@@ -146,8 +94,6 @@ gene_metadata <- read.csv(infile_gene_metadata
                       , header = T)
 cat("Dim:", dim(gene_metadata))
 
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-# FIXME: remove
 # counting NAs in AF, OR cols: 
 if (identical(sum(is.na(my_df_u$or_mychisq))
               , sum(is.na(my_df_u$pval_fisher))
@@ -176,31 +122,24 @@ if (identical(sum(is.na(my_df_u$or_kin))
       , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
 }
 
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-# clear variables
-rm(in_filename_gene_metadata, infile_gene_metadata)
-
 str(gene_metadata)
 
+###################################################################
+#                           combining: PS
+###################################################################
 # sort by position (same as my_df)
-# earlier it was mutationinformation
-#head(gene_metadata$mutationinformation)
-#gene_metadata = gene_metadata[order(gene_metadata$mutationinformation),]
-##head(gene_metadata$mutationinformation)
-
 head(gene_metadata$position)
 gene_metadata = gene_metadata[order(gene_metadata$position),]
 head(gene_metadata$position)
 
-###########################
-# Merge 1: two dfs with NA
-# merged_df2
-###########################
+#=========================
+# Merge 1: merged_df2
+# dfs with NAs in ORs
+#=========================
 head(my_df_u$mutationinformation)
 head(gene_metadata$mutationinformation)
 
 # Find common columns b/w two df
-# FIXME: mutation has empty cell for some muts
 merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata))
 
 cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)"
@@ -214,9 +153,6 @@ table(nchar(my_df_u$wild_type))
 table(nchar(my_df_u$mutant_type))
 table(nchar(my_df_u$position))
 
-#=============
-# merged_df2: gene_metadata + my_df
-#==============
 # all.y  because x might contain non-structural positions!
 merged_df2 = merge(x = gene_metadata
                   , y = my_df_u
@@ -226,9 +162,7 @@ merged_df2 = merge(x = gene_metadata
 cat("Dim of merged_df2: ", dim(merged_df2))
 head(merged_df2$position)
 
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-# FIXME: count how many unique muts have entries in meta data
-# should PASS 
+# sanity check 
 cat("Checking nrows in merged_df2")
 if(nrow(gene_metadata) == nrow(merged_df2)){
   cat("PASS: nrow(merged_df2) = nrow (gene associated gene_metadata)"
@@ -243,47 +177,23 @@ if(nrow(gene_metadata) == nrow(merged_df2)){
   meta_muts_u = unique(gene_metadata$mutationinformation)
     # find the index where it differs
   unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
+  quit()
 }
 
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-# sort by position
-head(merged_df2$position)
-merged_df2 = merged_df2[order(merged_df2$position),]
-head(merged_df2$position)
-
-merged_df2v3 = merge(x = gene_metadata
-                   , y = my_df_u
-                   , by = merging_cols
-                   , all = T) 
-
-merged_df2v2 = merge(x = gene_metadata
-                     , y = my_df_u
-                     , by = merging_cols
-                     , all.x = T) 
-#!=!=!=!=!=!=!=!
-#identical(merged_df2, merged_df2v2)
-
-nrow(merged_df2[merged_df2$position==186,])
-#!=!=!=!=!=!=!=!
-
-# should  be False
-identical(merged_df2, merged_df2v2)
-table(merged_df2$position%in%merged_df2v2$position)
-
-#!!!!!!!!!!! check why these differ
-
-#########
-# merge 3b (merged_df3):remove duplicated mutations
+#=========================
+# Merge 2: merged_df3
+# dfs with NAs in ORs
+#
+# Cannot trust lineage, country from this df as the same mutation
+# can have many different lineages
+# but this should be good for the numerical corr plots
+#=========================
+# remove duplicated mutations
 cat("Merging dfs without NAs: small df (removing muts with no AF|OR associated)"
     ,"\nCannot trust lineage info from this"
     ,"\nlinking col: mutationinforamtion"
     ,"\nfilename: merged_df3")
 
-#==#=#=#=#=#=#
-# Cannot trust lineage, country from this df as the same mutation
-# can have many different lineages
-# but this should be good for the numerical corr plots
-#=#=#=#=#=#=#=
 merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] 
 head(merged_df3$position); tail(merged_df3$position) # should be sorted
 
@@ -326,12 +236,10 @@ if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_k
   quit()
 }
 
-###########################
-# 4: merging two dfs: without NA
-###########################
-#########
-# merge 4a (merged_df2_comp): same as merge 1 but excluding NA
-#########
+#=========================
+# Merge3: merged_df2_comp
+# same as merge 1 but excluding NAs from ORs, etc.
+#=========================
 cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"
     ,"\nlinking col: Mutationinforamtion"
     ,"\nfilename: merged_df2_comp")
@@ -357,9 +265,12 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
   print('Index mismatch for mychisq and kin ors. Aborting NA ommission')
 }
 
-#########
-# merge 4b (merged_df3_comp): remove duplicate mutation information
-#########
+#=========================
+# Merge4: merged_df3_comp
+# same as merge 2 but excluding NAs from ORs, etc or 
+# remove duplicate mutation information
+#=========================
+
 if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){
   print('mychisq and kin ors missing indices match. Procedding with omitting NAs')
   na_count_df3 = sum(is.na(merged_df3$af))
@@ -388,7 +299,6 @@ bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]
 all.equal(foo, bar)
 #summary(comparedf(foo, bar))
 
-#=============== end of combining df
 #==============================================================
 #################
 # OPTIONAL: write ALL 4 output files 
@@ -416,7 +326,32 @@ all.equal(foo, bar)
 # clear variables
 rm(foo, bar, gene_metadata
    , in_filename_params, infile_params, merging_cols
+   , in_filename_gene_metadata, infile_gene_metadata
    , merged_df2v2, merged_df2v3)
+#*************************
+#####################################################################
+#                       Combining: LIG
+#####################################################################
 
-#============================= end of script
+#=========================
+# Merges 5-8
+#=========================
 
+merged_df2_lig = merged_df2[merged_df2$ligand_distance<10,]
+merged_df2_comp_lig = merged_df2_comp[merged_df2_comp$ligand_distance<10,]
+
+merged_df3_lig = merged_df3[merged_df3$ligand_distance<10,]
+merged_df3_comp_lig = merged_df3_comp[merged_df3_comp$ligand_distance<10,]
+
+# sanity check
+if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){
+  print("PASS: verified merged_df3_lig")
+}else{
+  cat(paste0('FAIL: nrow mismatch for merged_df3_lig'
+             , "\nExpected:", nrow(my_df_u_lig)
+             , "\nGot:", nrow(merged_df3_lig)))
+}
+
+#==========================================================================
+# end of script
+##==========================================================================
\ No newline at end of file
diff --git a/scripts/plotting/combining_two_df_FIXME.R b/scripts/plotting/combining_two_df_FIXME.R
deleted file mode 100755
index 0d34e06..0000000
--- a/scripts/plotting/combining_two_df_FIXME.R
+++ /dev/null
@@ -1,442 +0,0 @@
-getwd()
-setwd("~/git/LSHTM_analysis/scripts/plotting/")
-getwd()
-
-#########################################################
-# TASK: To combine struct params and meta data for plotting
-# Input csv files:
-# 1) <gene>_all_params.csv
-# 2) <gene>_meta_data.csv
-
-# Output: 
-# 1) muts with opposite effects on stability
-# 2) large combined df including NAs for AF, OR,etc
-# 		Dim: same no. of rows as gene associated meta_data_with_AFandOR
-# 3) small combined df including NAs for AF, OR, etc.
-# 		Dim: same as mcsm data
-# 4) large combined df excluding NAs 
-# 		Dim: dim(#1) - no. of NAs(AF|OR) + 1
-# 5) small combined df excluding NAs
-# 		Dim: dim(#2) - no. of unique NAs - 1
-# This script is sourced from other .R scripts for plotting
-#########################################################
-
-##########################################################
-# 				Installing and loading required packages 			 
-##########################################################
-source("Header_TT.R")
-#require(data.table)
-#require(arsenal)
-#require(compare)
-#library(tidyverse)
-
-
-#%% variable assignment: input and output paths & filenames
-drug = "pyrazinamide"
-gene = "pncA"
-gene_match = paste0(gene,"_p.")
-cat(gene_match)
-
-#=============
-# directories
-#=============
-datadir = paste0("~/git/Data")
-indir = paste0(datadir, "/", drug, "/input")
-outdir = paste0("~/git/Data", "/", drug, "/output")
-
-#===========
-# input
-#===========
-#in_filename = "mcsm_complex1_normalised.csv"
-in_filename_params = paste0(tolower(gene), "_all_params.csv") 
-infile_params = paste0(outdir, "/", in_filename_params)
-cat(paste0("Input file 1:", infile_params) )
-
-# infile 2: gene associated meta data
-#in_filename_gene_metadata = paste0(tolower(gene),  "_meta_data_with_AFandOR.csv")
-in_filename_gene_metadata = paste0(tolower(gene),  "_metadata.csv")
-infile_gene_metadata = paste0(outdir, "/", in_filename_gene_metadata)
-cat(paste0("Input infile 2:", infile_gene_metadata))
-
-#===========
-# output
-#===========
-# mutations with opposite effects
-out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv")
-outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts)
-
-
-#%%===============================================================
-###########################
-# Read file: struct params
-###########################
-cat("Reading struct params including mcsm:"
-    , in_filename_params)
-
-mcsm_data = read.csv(infile_params
-                     #, row.names = 1
-                     , stringsAsFactors = F
-                     , header = T) 
-
-cat("Input dimensions:", dim(mcsm_data)) #416, 86
-
-# clear variables
-rm(in_filename_params, infile_params)
-
-str(mcsm_data)
-
-table(mcsm_data$duet_outcome); sum(table(mcsm_data$duet_outcome) )
-
-# spelling Correction 1: DUET incase American spelling needed!
-#mcsm_data$duet_outcome[mcsm_data$duet_outcome=="Stabilising"] <- "Stabilizing"
-#mcsm_data$duet_outcome[mcsm_data$duet_outcome=="Destabilising"] <- "Destabilizing"
-
-# checks: should be the same as above
-table(mcsm_data$duet_outcome); sum(table(mcsm_data$duet_outcome) )
-head(mcsm_data$duet_outcome); tail(mcsm_data$duet_outcome)
-
-# spelling Correction 2: Ligand incase American spelling needed!
-table(mcsm_data$ligand_outcome); sum(table(mcsm_data$ligand_outcome) )
-#mcsm_data$ligand_outcome[mcsm_data$ligand_outcome=="Stabilising"] <- "Stabilizing"
-#mcsm_data$ligand_outcome[mcsm_data$ligand_outcome=="Destabilising"] <- "Destabilizing"
-
-# checks: should be the same as above
-table(mcsm_data$ligand_outcome); sum(table(mcsm_data$ligand_outcome) )
-head(mcsm_data$ligand_outcome); tail(mcsm_data$ligand_outcome)
-
-# muts with opposing effects on protomer and ligand stability
-table(mcsm_data$duet_outcome != mcsm_data$ligand_outcome)
-changes = mcsm_data[which(mcsm_data$duet_outcome != mcsm_data$ligand_outcome),]
-
-# sanity check: redundant, but uber cautious!
-dl_i = which(mcsm_data$duet_outcome != mcsm_data$ligand_outcome)
-ld_i = which(mcsm_data$ligand_outcome != mcsm_data$duet_outcome)
-
-cat("Identifying muts with opposite stability effects")
-if(nrow(changes) == (table(mcsm_data$duet_outcome != mcsm_data$ligand_outcome)[[2]]) & identical(dl_i,ld_i)) {
-  cat("PASS: muts with opposite effects on stability and affinity correctly identified"
-        , "\nNo. of such muts: ", nrow(changes))
-}else {
-  cat("FAIL: unsuccessful in extracting muts with changed stability effects")
-}
-
-#***************************
-# write file: changed muts
-write.csv(changes, outfile_opp_muts)
-
-cat("Finished writing file for muts with opp effects:"
-    , "\nFilename: ", outfile_opp_muts
-    , "\nDim:", dim(changes))
-
-# clear variables
-rm(out_filename_opp_muts, outfile_opp_muts)
-rm(changes, dl_i, ld_i)
-
-#***************************
-# count na in each column
-na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
-
-# sort by mutationinformation
-##mcsm_data = mcsm_data[order(mcsm_data$mutationinformation),]
-##head(mcsm_data$mutationinformation)
-
-df_ncols = ncol(mcsm_data)
-
-# REMOVE as this is dangerous due to dup muts
-# get freq count of positions and add to the df 
-#setDT(mcsm_data)[, occurrence := .N, by = .(position)] 
-
-#cat("Added 1 col: position frequency to see which posn has how many muts"
-#    , "\nNo. of cols now", ncol(mcsm_data)
-#    , "\nNo. of cols before: ", df_ncols)
-
-#pos_count_check = data.frame(mcsm_data$position, mcsm_data$occurrence)
-
-# check duplicate muts
-if (length(unique(mcsm_data$mutationinformation)) == length(mcsm_data$mutationinformation)){
-  cat("No duplicate mutations in mcsm data")
-}else{
-  dup_muts = mcsm_data[duplicated(mcsm_data$mutationinformation),]
-  dup_muts_nu = length(unique(dup_muts$mutationinformation))
-  cat(paste0("CAUTION:", nrow(dup_muts), " Duplicate mutations identified"
-             , "\nOf these, no. of unique mutations are:", dup_muts_nu
-             , "\nExtracting df with unique mutations only"))
-  mcsm_data_u = mcsm_data[!duplicated(mcsm_data$mutationinformation),]
-}
-
-if (nrow(mcsm_data_u) == length(unique(mcsm_data$mutationinformation))){
-  cat("Df without duplicate mutations successfully extracted")
-} else{
-  cat("FAIL: could not extract clean df!")
-  quit()
-}
-
-###########################
-# 2: Read file: <gene>_meta data.csv
-###########################
-cat("Reading meta data file:", infile_gene_metadata)
-
-gene_metadata <- read.csv(infile_gene_metadata
-                      , stringsAsFactors = F
-                      , header = T)
-cat("Dim:", dim(gene_metadata))
-
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-# FIXME: remove
-# counting NAs in AF, OR cols: 
-if (identical(sum(is.na(gene_metadata$OR))
-              , sum(is.na(gene_metadata$pvalue))
-              , sum(is.na(gene_metadata$AF)))){
-  cat("PASS: NA count match for OR, pvalue and AF\n")
-  na_count = sum(is.na(gene_metadata$AF))
-  cat("No. of NAs: ", sum(is.na(gene_metadata$OR)))
-} else{
-  cat("FAIL: NA count mismatch"
-      , "\nNA in OR: ", sum(is.na(gene_metadata$OR))
-      , "\nNA in pvalue: ", sum(is.na(gene_metadata$pvalue))
-      , "\nNA in AF:", sum(is.na(gene_metadata$AF)))
-}
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-# clear variables
-rm(in_filename_gene_metadata, infile_gene_metadata)
-
-str(gene_metadata)
-
-# sort by position (same as mcsm_data)
-# earlier it was mutationinformation
-#head(gene_metadata$mutationinformation)
-#gene_metadata = gene_metadata[order(gene_metadata$mutationinformation),]
-##head(gene_metadata$mutationinformation)
-
-head(gene_metadata$position)
-gene_metadata = gene_metadata[order(gene_metadata$position),]
-head(gene_metadata$position)
-
-###########################
-# Merge 1: two dfs with NA
-# merged_df2
-###########################
-head(mcsm_data$mutationinformation)
-head(gene_metadata$mutationinformation)
-
-# Find common columns b/w two df
-merging_cols = intersect(colnames(mcsm_data), colnames(gene_metadata))
-
-cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)"
-           , "\nNo. of merging cols:", length(merging_cols)
-           , "\nMerging columns identified:"))
-print(merging_cols)
-
-#=============
-# merged_df2): gene_metadata + mcsm_data
-#==============
-merged_df2 = merge(x = gene_metadata
-                  , y = mcsm_data
-                  , by = merging_cols
-                  , all.y = T)
-
-cat("Dim of merged_df2: ", dim(merged_df2) #4520, 11
-    )
-head(merged_df2$position)
-
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-# FIXME: count how many unique muts have entries in meta data
-# sanity check
-cat("Checking nrows in merged_df2")
-if(nrow(gene_metadata) == nrow(merged_df2)){
-  cat("nrow(merged_df2) = nrow (gene associated gene_metadata)"
-      ,"\nExpected no. of rows: ",nrow(gene_metadata) 
-      ,"\nGot no. of rows: ", nrow(merged_df2))
-} else{
-  cat("nrow(merged_df2)!= nrow(gene associated gene_metadata)"
-      , "\nExpected no. of rows after merge: ", nrow(gene_metadata)
-      , "\nGot no. of rows: ", nrow(merged_df2)
-      , "\nFinding discrepancy")
-  merged_muts_u = unique(merged_df2$mutationinformation)
-  meta_muts_u = unique(gene_metadata$mutationinformation)
-    # find the index where it differs
-  unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
-}
-
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-# sort by position
-head(merged_df2$position)
-merged_df2 = merged_df2[order(merged_df2$position),]
-head(merged_df2$position)
-
-merged_df2v3 = merge(x = gene_metadata
-                   , y = mcsm_data
-                   , by = merging_cols
-                   , all = T) 
-
-merged_df2v2 = merge(x = gene_metadata
-                     , y = mcsm_data
-                     , by = merging_cols
-                     , all.x = T) 
-#!=!=!=!=!=!=!=!
-# COMMENT: used all.y since position 186 is not part of the struc,
-# hence doesn"t have a mcsm value
-# but 186 is associated with mutation
-#!=!=!=!=!=!=!=!
-
-# should  be False
-identical(merged_df2, merged_df2v2)
-table(merged_df2$position%in%merged_df2v2$position)
-
-rm(merged_df2v2)
-
-#!!!!!!!!!!! check why these differ
-
-#########
-# merge 3b (merged_df3):remove duplicate mutation information
-#########
-cat("Merging dfs without NAs: small df (removing muts with no AF|OR associated)"
-    ,"\nCannot trust lineage info from this"
-    ,"\nlinking col: Mutationinforamtion"
-    ,"\nfilename: merged_df3")
-
-#==#=#=#=#=#=#
-# Cannot trust lineage, country from this df as the same mutation
-# can have many different lineages
-# but this should be good for the numerical corr plots
-#=#=#=#=#=#=#=
-merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] 
-head(merged_df3$position); tail(merged_df3$position) # should be sorted
-
-# sanity check
-cat("Checking nrows in merged_df3")
-if(nrow(mcsm_data) == nrow(merged_df3)){
-  cat("PASS: No. of rows match with mcsm_data"
-      ,"\nExpected no. of rows: ", nrow(mcsm_data)
-      ,"\nGot no. of rows: ", nrow(merged_df3))
-} else {
-  cat("FAIL: No. of rows mismatch"
-      , "\nNo. of rows mcsm_data: ", nrow(mcsm_data)
-      , "\nNo. of rows merged_df3: ", nrow(merged_df3))
-}
-
-# counting NAs in AF, OR cols in merged_df3
-# this is becuase mcsm has no AF, OR cols,
-# so you cannot count NAs
-if (identical(sum(is.na(merged_df3$OR))
-              , sum(is.na(merged_df3$pvalue))
-              , sum(is.na(merged_df3$AF)))){
-  cat("PASS: NA count match for OR, pvalue and AF\n")
-  na_count_df3 = sum(is.na(merged_df3$AF))
-  cat("No. of NAs: ", sum(is.na(merged_df3$OR)))
-} else{
-  cat("FAIL: NA count mismatch"
-      , "\nNA in OR: ", sum(is.na(merged_df3$OR))
-      , "\nNA in pvalue: ", sum(is.na(merged_df3$pvalue))
-      , "\nNA in AF:", sum(is.na(merged_df3$AF)))
-}
-
-###########################
-# 4: merging two dfs: without NA
-###########################
-#########
-# merge 4a (merged_df2_comp): same as merge 1 but excluding NA
-#########
-cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"
-    ,"\nlinking col: Mutationinforamtion"
-    ,"\nfilename: merged_df2_comp")
-
-merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] 
-#merged_df2_comp = merged_df2[!duplicated(merged_df2$mutationinformation),] 
-
-# sanity check
-cat("Checking nrows in merged_df2_comp")
-if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){
-  cat("PASS: No. of rows match"
-      ,"\nDim of merged_df2_comp: "
-      ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count + 1
-      , "\nNo. of rows: ", nrow(merged_df2_comp)
-      , "\nNo. of cols: ", ncol(merged_df2_comp))
-}else{
-  cat("FAIL: No. of rows mismatch"
-      ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count + 1
-      ,"\nGot no. of rows: ", nrow(merged_df2_comp))
-}
-
-#########
-# merge 4b (merged_df3_comp): remove duplicate mutation information
-#########
-merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$mutationinformation),] 
-
-cat("Dim of merged_df3_comp: "
-    , "\nNo. of rows: ", nrow(merged_df3_comp)
-    , "\nNo. of cols: ", ncol(merged_df3_comp))
-
-# alternate way of deriving merged_df3_comp
-foo = merged_df3[!is.na(merged_df3$AF),]
-# compare dfs: foo and merged_df3_com
-all.equal(foo, merged_df3)
-
-summary(comparedf(foo, merged_df3))
-
-# sanity check
-cat("Checking nrows in merged_df3_comp")
-if(nrow(merged_df3_comp) == nrow(merged_df3)){
-  cat("NO NAs detected in merged_df3 in AF|OR cols"
-      ,"\nNo. of rows are identical: ", nrow(merged_df3))
-} else{
-  if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) {
-  cat("PASS: NAs detected in merged_df3 in AF|OR cols"
-      , "\nNo. of NAs: ", na_count_df3
-      , "\nExpected no. of rows in merged_df3_comp: ", nrow(merged_df3) - na_count_df3
-      , "\nGot no. of rows: ", nrow(merged_df3_comp))
-  }
-}
-
-#=============== end of combining df
-#*********************
-# writing 1 file in the style of a loop: merged_df3
-# print(output dir)
-#i = "merged_df3"
-#out_filename = paste0(i, ".csv")
-#outfile = paste0(outdir, "/", out_filename)
-
-#cat("Writing output file: "
-#    ,"\nFilename: ", out_filename
-#    ,"\nPath: ", outdir)
-
-#template: write.csv(merged_df3, "merged_df3.csv")
-#write.csv(get(i), outfile, row.names = FALSE)
-#cat("Finished writing: ", outfile
-#    , "\nNo. of rows: ", nrow(get(i))
-#    , "\nNo. of cols: ", ncol(get(i)))
-
-#%% write_output files;  all 4 files: 
-outvars = c("merged_df2"
-             , "merged_df3"
-             , "merged_df2_comp"
-             , "merged_df3_comp")
-
-cat("Writing output files: "
-    , "\nPath:", outdir)
-
-for (i in outvars){
-#  cat(i, "\n")
-  out_filename = paste0(i, ".csv")
-#  cat(out_filename, "\n")
-#  cat("getting value of variable: ", get(i))
-  outfile = paste0(outdir, "/", out_filename)
-#  cat("Full output path: ", outfile, "\n")
-  cat("Writing output file:"
-      ,"\nFilename: ", out_filename,"\n")
-  write.csv(get(i), outfile, row.names = FALSE)
-  cat("Finished writing: ", outfile
-      , "\nNo. of rows: ", nrow(get(i))
-      , "\nNo. of cols: ", ncol(get(i)), "\n")
-}
-
-# alternate way to replace with implicit loop 
-# FIXME
-#sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, "/", outvars, ".csv")))
-#*************************
-# clear variables
-rm(mcsm_data, gene_metadata, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, df_ncols, outdir)
-rm(pos_count_check)
-#============================= end of script
-
diff --git a/scripts/plotting/lineage_basic_barplot.R b/scripts/plotting/lineage_basic_barplot.R
index 1c91483..8b107bb 100644
--- a/scripts/plotting/lineage_basic_barplot.R
+++ b/scripts/plotting/lineage_basic_barplot.R
@@ -5,27 +5,30 @@ getwd()
 #########################################################
 # TASK: Basic lineage barplot showing numbers
 
-# Output:
+# Output: Basic barplot with lineage samples and mut count
 
 ##########################################################
 # 				Installing and loading required packages 			 
 ##########################################################
 source("Header_TT.R")
 require(data.table)
-source("combining_two_df.R")
-
-#==========================
+source("combining_dfs_plotting.R")
 # should return the following dfs, directories and variables
 
-# df with NA:
-# merged_df2
-# merged_df3
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
 
-# df without NA:
-# merged_df2_comp
-# merged_df3_comp
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
 
-# my_df_u
+# 9) my_df_u
+# 10) my_df_u_lig
 
 cat(paste0("Directories imported:"
            , "\ndatadir:", datadir
@@ -38,13 +41,16 @@ cat(paste0("Variables imported:"
            , "\ngene:", gene
            , "\ngene_match:", gene_match
            , "\nAngstrom symbol:", angstroms_symbol
-           , "\nNo. of cols:", df_ncols
            , "\nNo. of duplicated muts:", dup_muts_nu
            , "\nNA count for ORs:", na_count
            , "\nNA count in df2:", na_count_df2
            , "\nNA count in df3:", na_count_df3))       
 
-#=========================
+#===========
+# input
+#===========
+# output of combining_dfs_plotting.R
+
 #=======
 # output
 #=======
@@ -82,15 +88,11 @@ is.factor(my_df$lineage)
 # fill = lineage
 #============================
 table(my_df$lineage)
-
-#****************
-# Plot: Lineage Barplot
-#****************
+as.data.frame(table(my_df$lineage))
 
 #=============
 # Data for plots
 #=============
-
 # REASSIGNMENT
 df <- my_df
 
@@ -111,18 +113,7 @@ sel_lineages = c("lineage1"
                  #, "lineage7"
                  )
 
-df_lin = subset(df, subset = lineage %in% sel_lineages )
-
-#FIXME; add sanity check for numbers.
-# Done this manually
-
-############################################################
-
-#########
-# Data for barplot: Lineage barplot
-# to show total samples and number of unique mutations 
-# within each linege
-##########
+df_lin = subset(df, subset = lineage %in% sel_lineages)
 
 # Create df with lineage inform & no. of unique mutations
 # per lineage and total samples within lineage
@@ -193,7 +184,7 @@ printFile = g + geom_bar(stat = "identity"
         , axis.title.y = element_text(size = my_als
                                       , colour = 'black')
         , legend.position = "top"
-        , legend.text = element_text(size = my_als) + 
+        , legend.text = element_text(size = my_als)) + 
           #geom_text() +
           geom_label(aes(label = value)
                      , size = 5
@@ -212,7 +203,7 @@ printFile = g + geom_bar(stat = "identity"
                             , name=''
                             , labels=c('Mutations', 'Total Samples')) +
           scale_x_discrete(breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
-                           , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')))
+                           , labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4'))
 
 print(printFile)
 dev.off()
diff --git a/scripts/plotting/lineage_count.txt b/scripts/plotting/lineage_count.txt
new file mode 100644
index 0000000..fe6a87d
--- /dev/null
+++ b/scripts/plotting/lineage_count.txt
@@ -0,0 +1,71 @@
+#=============
+# merged_df2
+#=============
+----------------
+# no. of samples
+----------------
+                  Var1 Freq
+1                         8
+2             lineage1  144
+3    lineage1;lineage2    3
+4    lineage1;lineage4    4
+5             lineage2 1886
+6    lineage2;lineage4   19
+7             lineage3  190
+8    lineage3;lineage4   11
+9             lineage4 2213
+10   lineage4;lineage6    1
+11   lineage4;lineage7    1
+12 lineage4;lineageBOV    1
+13            lineage5   31
+14            lineage6    9
+15            lineage7    3
+16          lineageBOV  392
+
+----------------
+# no. of nsSNPs
+----------------
+
+  sel_lineages num_snps_u total_samples
+1     lineage1         74           144
+2     lineage2        277          1886
+3     lineage3        104           190
+4     lineage4        311          2213
+5     lineage5         18            31
+6     lineage6          8             9
+7     lineage7          1             3
+
+
+#=============
+# merged_df2_comp
+#=============
+----------------
+# no. of samples
+----------------
+
+                  Var1 Freq
+1                         3
+2             lineage1  108
+3    lineage1;lineage2    2
+4    lineage1;lineage4    2
+5             lineage2 1497
+6    lineage2;lineage4   13
+7             lineage3  154
+8    lineage3;lineage4    3
+9             lineage4 1846
+10 lineage4;lineageBOV    1
+11            lineage5   12
+12            lineage6    2
+13          lineageBOV  36
+
+----------------
+# no. of nsSNPs
+----------------
+  sel_lineages num_snps_u total_samples
+1     lineage1         42           108
+2     lineage2        141          1497
+3     lineage3         75           154
+4     lineage4        148          1846
+5     lineage5          9            12
+6     lineage6          2             2
+7     lineage7          0             0
diff --git a/scripts/plotting/opp_mcsm_muts.R b/scripts/plotting/opp_mcsm_muts.R
new file mode 100644
index 0000000..5b02f34
--- /dev/null
+++ b/scripts/plotting/opp_mcsm_muts.R
@@ -0,0 +1,95 @@
+#!/usr/bin/env Rscript      
+#########################################################
+# TASK: To write muts with opposite effects on
+# protomer and ligand stability
+#########################################################
+# working dir and loading libraries
+
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+source("plotting_data.R")
+
+# should return the following dfs, directories and variables
+# my_df
+# my_df_u
+# my_df_u_lig
+# dup_muts
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nLength of upos:", length(upos)
+           , "\nAngstrom symbol:", angstroms_symbol))       
+
+# clear excess variable
+rm(my_df, upos, dup_muts)
+#========================================================
+#===========
+# input
+#===========
+#in_file1: output of plotting_data.R
+# my_df_u
+
+# output
+#===========
+# mutations with opposite effects
+out_filename_opp_muts = paste0(tolower(gene), "_muts_opp_effects.csv")
+outfile_opp_muts = paste0(outdir, "/", out_filename_opp_muts)
+
+#%%===============================================================
+
+# spelling Correction 1: DUET incase American spelling needed!
+table(my_df_u$duet_outcome); sum(table(my_df_u$duet_outcome) )
+#my_df_u$duet_outcome[my_df_u$duet_outcome=="Stabilising"] <- "Stabilizing"
+#my_df_u$duet_outcome[my_df_u$duet_outcome=="Destabilising"] <- "Destabilizing"
+
+
+# spelling Correction 2: Ligand incase American spelling needed!
+table(my_df_u$ligand_outcome); sum(table(my_df_u$ligand_outcome) )
+#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Stabilising"] <- "Stabilizing"
+#my_df_u$ligand_outcome[my_df_u$ligand_outcome=="Destabilising"] <- "Destabilizing"
+
+
+# muts with opposing effects on protomer and ligand stability
+table(my_df_u$duet_outcome != my_df_u$ligand_outcome)
+changes = my_df_u[which(my_df_u$duet_outcome != my_df_u$ligand_outcome),]
+
+# sanity check: redundant, but uber cautious!
+dl_i = which(my_df_u$duet_outcome != my_df_u$ligand_outcome)
+ld_i = which(my_df_u$ligand_outcome != my_df_u$duet_outcome)
+
+cat("Identifying muts with opposite stability effects")
+if(nrow(changes) == (table(my_df_u$duet_outcome != my_df_u$ligand_outcome)[[2]]) & identical(dl_i,ld_i)) {
+  cat("PASS: muts with opposite effects on stability and affinity correctly identified"
+        , "\nNo. of such muts: ", nrow(changes))
+}else {
+  cat("FAIL: unsuccessful in extracting muts with changed stability effects")
+}
+
+#==========================
+# write file: changed muts
+#==========================
+write.csv(changes, outfile_opp_muts)
+
+cat("Finished writing file for muts with opp effects:"
+    , "\nFilename: ", outfile_opp_muts
+    , "\nDim:", dim(changes))
+
+# clear variables
+rm(out_filename_opp_muts, outfile_opp_muts)
+rm(changes, dl_i, ld_i)
+
+# count na in each column
+na_count = sapply(my_df_u, function(y) sum(length(which(is.na(y))))); na_count
+df_ncols = ncol(my_df_u)
+
+#===================================== end of script

From 648be02665dbc9102707393a0cdf3944ec784ac7 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 7 Sep 2020 15:27:53 +0100
Subject: [PATCH 08/27] ks test script added

---
 scripts/ks_test_PS.R               | 211 +++++++++++++++++++++++++++++
 scripts/plotting/autoviz.py        |  45 ------
 scripts/plotting/combined_or.R     |   1 -
 scripts/plotting/lineage_count.txt |  71 ----------
 4 files changed, 211 insertions(+), 117 deletions(-)
 create mode 100644 scripts/ks_test_PS.R
 delete mode 100644 scripts/plotting/autoviz.py
 delete mode 100644 scripts/plotting/lineage_count.txt

diff --git a/scripts/ks_test_PS.R b/scripts/ks_test_PS.R
new file mode 100644
index 0000000..bc11957
--- /dev/null
+++ b/scripts/ks_test_PS.R
@@ -0,0 +1,211 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: KS test for PS/DUET lineage distributions
+#=======================================================================
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/")
+getwd()
+
+#source("/plotting/Header_TT.R")
+#source("../barplot_colour_function.R")
+#require(data.table)
+source("plotting/combining_dfs_plotting.R")
+# should return the following dfs, directories and variables
+
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nAngstrom symbol:", angstroms_symbol
+           , "\nNo. of duplicated muts:", dup_muts_nu
+           , "\nNA count for ORs:", na_count
+           , "\nNA count in df2:", na_count_df2
+           , "\nNA count in df3:", na_count_df3))       
+
+###########################
+# Data for  stats
+# you need merged_df2 or merged_df2_comp
+# since this is one-many relationship 
+# i.e the same SNP can belong to multiple lineages
+# using the _comp dataset means
+# we lose some muts and at this level, we should use
+# as much info as available, hence use df with NA
+###########################
+
+# REASSIGNMENT
+my_df  = merged_df2
+
+# delete variables not required
+rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
+
+# quick checks
+colnames(my_df)
+str(my_df)
+
+# Ensure correct data type in columns to plot: need to be factor
+is.factor(my_df$lineage)
+my_df$lineage = as.factor(my_df$lineage)
+is.factor(my_df$lineage)
+
+table(my_df$mutation_info); str(my_df$mutation_info)
+
+# subset df with dr muts only
+my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 
+table(my_df_dr$mutation_info)
+
+# stats for all muts and dr_muts
+# 1) for all muts
+# 2) for dr_muts
+#===========================
+table(my_df$lineage); str(my_df$lineage)
+table(my_df_dr$lineage); str(my_df_dr$lineage)
+
+# subset only lineages1-4
+sel_lineages = c("lineage1"
+                 , "lineage2"
+                 , "lineage3"
+                 , "lineage4")
+
+# subset and refactor: all muts
+df_lin = subset(my_df, subset = lineage %in% sel_lineages)
+df_lin$lineage = factor(df_lin$lineage)
+
+# subset and refactor: dr muts
+df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
+df_lin_dr$lineage = factor(df_lin_dr$lineage)
+
+
+#{RESULT: No of samples within lineage}
+table(df_lin$lineage) 
+table(df_lin_dr$lineage)
+
+#{Result: No. of unique mutations the 4 lineages contribute to}
+length(unique(df_lin$mutationinformation))
+length(unique(df_lin_dr$mutationinformation))
+
+
+# COMPARING DISTRIBUTIONS
+#================
+# ALL mutations
+#=================
+head(df_lin$lineage)
+df_lin$lineage = as.character(df_lin$lineage)
+
+lin1 = df_lin[df_lin$lineage == "lineage1",]$duet_scaled
+lin2 = df_lin[df_lin$lineage == "lineage2",]$duet_scaled
+lin3 = df_lin[df_lin$lineage == "lineage3",]$duet_scaled
+lin4 = df_lin[df_lin$lineage == "lineage4",]$duet_scaled
+
+# ks test
+lin12 = ks.test(lin1,lin2) 
+lin12_df = as.data.frame(cbind(lin12$data.name, lin12$p.value))
+  
+lin13 = ks.test(lin1,lin3) 
+lin13_df = as.data.frame(cbind(lin13$data.name, lin13$p.value))
+
+lin14 = ks.test(lin1,lin4) 
+lin14_df = as.data.frame(cbind(lin14$data.name, lin14$p.value))
+
+lin23 = ks.test(lin2,lin3)
+lin23_df = as.data.frame(cbind(lin23$data.name, lin23$p.value))
+
+lin24 = ks.test(lin2,lin4)  
+lin24_df = as.data.frame(cbind(lin24$data.name, lin24$p.value))
+
+lin34 = ks.test(lin3,lin4)
+lin34_df = as.data.frame(cbind(lin34$data.name, lin34$p.value))
+
+ks_results_all = rbind(lin12_df 
+                   , lin13_df
+                   , lin14_df
+                   , lin23_df
+                   , lin24_df
+                   , lin34_df)
+
+#p-value < 2.2e-16
+rm(lin12, lin12_df
+   , lin13, lin13_df
+   , lin14, lin14_df
+   , lin23, lin23_df
+   , lin24, lin24_df
+   , lin34, lin34_df)
+
+#================
+# DRUG mutations
+#=================
+head(df_lin_dr$lineage)
+df_lin_dr$lineage = as.character(df_lin_dr$lineage)
+
+lin1_dr = df_lin_dr[df_lin_dr$lineage == "lineage1",]$duet_scaled
+lin2_dr = df_lin_dr[df_lin_dr$lineage == "lineage2",]$duet_scaled
+lin3_dr = df_lin_dr[df_lin_dr$lineage == "lineage3",]$duet_scaled
+lin4_dr = df_lin_dr[df_lin_dr$lineage == "lineage4",]$duet_scaled
+
+# ks test: dr muts
+lin12_dr = ks.test(lin1_dr,lin2_dr) 
+lin12_df_dr = as.data.frame(cbind(lin12_dr$data.name, lin12_dr$p.value))
+
+lin13_dr = ks.test(lin1_dr,lin3_dr) 
+lin13_df_dr = as.data.frame(cbind(lin13_dr$data.name, lin13_dr$p.value))
+
+lin14_dr = ks.test(lin1_dr,lin4_dr) 
+lin14_df_dr = as.data.frame(cbind(lin14_dr$data.name, lin14_dr$p.value))
+
+lin23_dr = ks.test(lin2_dr,lin3_dr)
+lin23_df_dr = as.data.frame(cbind(lin23_dr$data.name, lin23_dr$p.value))
+
+lin24_dr = ks.test(lin2_dr,lin4_dr)  
+lin24_df_dr = as.data.frame(cbind(lin24_dr$data.name, lin24_dr$p.value))
+
+lin34_dr = ks.test(lin3_dr,lin4_dr)
+lin34_df_dr = as.data.frame(cbind(lin34_dr$data.name, lin34_dr$p.value))
+
+ks_results_dr = rbind(lin12_df_dr 
+                      , lin13_df_dr
+                      , lin14_df_dr
+                      , lin23_df_dr
+                      , lin24_df_dr
+                      , lin34_df_dr)
+
+ks_results_combined = cbind(ks_results_all, ks_results_dr)
+
+my_colnames = c("Lineage_comparisons"
+                , paste0("All_mutations n=", nrow(df_lin))
+                , paste0("Drug_associated_mutations n=", nrow(df_lin_dr)))
+my_colnames
+
+# select the output columns
+ks_results_combined_f = ks_results_combined[,c(1,2,4)]
+
+colnames(ks_results_combined_f) = my_colnames
+ks_results_combined_f 
+
+#=============
+# write output file
+#=============
+ks_results = paste0(outdir,"/results/ks_results.csv")
+write.csv(ks_results_combined_f, ks_results, row.names = F)
+
diff --git a/scripts/plotting/autoviz.py b/scripts/plotting/autoviz.py
deleted file mode 100644
index 8dc6405..0000000
--- a/scripts/plotting/autoviz.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python3
-#=======================================================================
-#%% useful links
-#https://towardsdatascience.com/autoviz-automatically-visualize-any-dataset-ba2691a8b55a
-#https://pypi.org/project/autoviz/
-#=======================================================================
-import os, sys
-import pandas as pd
-import numpy as np
-import re
-import argparse
-from autoviz.AutoViz_Class import AutoViz_Class
-
-homedir = os.path.expanduser('~')
-os.chdir(homedir + '/git/LSHTM_analysis/scripts')
-#%%============================================================================
-# variables
-gene = 'pncA'
-drug = 'pyrazinamide'
-
-#%%============================================================================
-#==============
-# directories
-#==============
-datadir = homedir + '/' + 'git/Data'
-    
-indir = datadir + '/' + drug + '/input'
-    
-outdir = datadir + '/' + drug + '/output'
-
-#=======
-# input
-#=======
-in_filename_plotting = 'car_design.csv'
-in_filename_plotting = gene.lower() + '_all_params.csv'
-infile_plotting = outdir + '/' + in_filename_plotting
-print('plotting file: ', infile_plotting
-      , '\n============================================================')
-#=======================================================================
-plotting_df = pd.read_csv(infile_plotting, sep = ',')
-#Instantiate the AutoViz class
-AV = AutoViz_Class()
-df = AV.AutoViz(infile_plotting)
-#df2 = AV.AutoViz(plotting_df)
-plotting_df.columns[~plotting_df.columns.isin(df.columns)]
diff --git a/scripts/plotting/combined_or.R b/scripts/plotting/combined_or.R
index 4bf2b80..1c167a8 100644
--- a/scripts/plotting/combined_or.R
+++ b/scripts/plotting/combined_or.R
@@ -1,4 +1,3 @@
-
 #!/usr/bin/env Rscript
 #########################################################
 # TASK: Basic lineage barplot showing numbers
diff --git a/scripts/plotting/lineage_count.txt b/scripts/plotting/lineage_count.txt
deleted file mode 100644
index fe6a87d..0000000
--- a/scripts/plotting/lineage_count.txt
+++ /dev/null
@@ -1,71 +0,0 @@
-#=============
-# merged_df2
-#=============
-----------------
-# no. of samples
-----------------
-                  Var1 Freq
-1                         8
-2             lineage1  144
-3    lineage1;lineage2    3
-4    lineage1;lineage4    4
-5             lineage2 1886
-6    lineage2;lineage4   19
-7             lineage3  190
-8    lineage3;lineage4   11
-9             lineage4 2213
-10   lineage4;lineage6    1
-11   lineage4;lineage7    1
-12 lineage4;lineageBOV    1
-13            lineage5   31
-14            lineage6    9
-15            lineage7    3
-16          lineageBOV  392
-
-----------------
-# no. of nsSNPs
-----------------
-
-  sel_lineages num_snps_u total_samples
-1     lineage1         74           144
-2     lineage2        277          1886
-3     lineage3        104           190
-4     lineage4        311          2213
-5     lineage5         18            31
-6     lineage6          8             9
-7     lineage7          1             3
-
-
-#=============
-# merged_df2_comp
-#=============
-----------------
-# no. of samples
-----------------
-
-                  Var1 Freq
-1                         3
-2             lineage1  108
-3    lineage1;lineage2    2
-4    lineage1;lineage4    2
-5             lineage2 1497
-6    lineage2;lineage4   13
-7             lineage3  154
-8    lineage3;lineage4    3
-9             lineage4 1846
-10 lineage4;lineageBOV    1
-11            lineage5   12
-12            lineage6    2
-13          lineageBOV  36
-
-----------------
-# no. of nsSNPs
-----------------
-  sel_lineages num_snps_u total_samples
-1     lineage1         42           108
-2     lineage2        141          1497
-3     lineage3         75           154
-4     lineage4        148          1846
-5     lineage5          9            12
-6     lineage6          2             2
-7     lineage7          0             0

From 5d9561f88a3a564a05f79ef1e767a79b08da58eb Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 7 Sep 2020 17:17:56 +0100
Subject: [PATCH 09/27] trying other num param plots

---
 scripts/plotting/test.R | 96 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 scripts/plotting/test.R

diff --git a/scripts/plotting/test.R b/scripts/plotting/test.R
new file mode 100644
index 0000000..8033517
--- /dev/null
+++ b/scripts/plotting/test.R
@@ -0,0 +1,96 @@
+setwd("/home/tanu/git/LSHTM_analysis/scripts/plotting")
+
+source("combining_dfs_plotting.R")
+
+table(merged_df3$mutation_info)
+
+# assign foldx
+#ddg<0 = "Stabilising" (-ve)
+table(merged_df3$ddg < 0)
+merged_df3$foldx_outcome = ifelse(merged_df3$ddg < 0, "Stabilising", "Destabilising")
+
+#===========
+# PS data
+#===========
+dr_muts = merged_df3[merged_df3$mutation_info == "dr_mutations_pyrazinamide",]
+other_muts = merged_df3[merged_df3$mutation_info == "other_mutations_pyrazinamide",]
+
+par(mfrow = c(1,1))
+par(mfrow = c(2,6))
+
+# mcsm duet
+boxplot(dr_muts$duet_scaled, other_muts$duet_scaled, main = "DUET"
+        #, col = factor(merged_df3$duet_outcome)
+        )
+wilcox.test(dr_muts$duet_scaled, other_muts$duet_scaled, paired = F)
+
+# foldx ddg
+boxplot(dr_muts$ddg, other_muts$ddg, main = "Foldx")
+wilcox.test(dr_muts$ddg, other_muts$ddg, paired = F)
+
+# rd
+boxplot(dr_muts$rd_values, other_muts$rd_values, main = "RD")
+wilcox.test(dr_muts$rd_values, other_muts$rd_values)
+
+# kd
+boxplot(dr_muts$kd_values, other_muts$kd_values, main = "KD")
+wilcox.test(dr_muts$kd_values, other_muts$kd_values)
+
+# asa
+boxplot(dr_muts$asa, other_muts$asa, main = "ASA")
+wilcox.test(dr_muts$asa, other_muts$asa)
+
+# rsa
+boxplot(dr_muts$rsa, other_muts$rsa, main = "RSA")
+wilcox.test(dr_muts$rsa, other_muts$rsa)
+
+#===================================================================
+#==========
+# LIG data
+#==========
+dr_muts_lig = merged_df3_lig[merged_df3_lig$mutation_info == "dr_mutations_pyrazinamide",]
+other_muts_lig = merged_df3_lig[merged_df3_lig$mutation_info == "other_mutations_pyrazinamide",]
+
+# mcsm ligand affinity
+boxplot(dr_muts_lig$duet_scaled, other_muts_lig$duet_scaled, main = "Ligand affinity")
+wilcox.test(dr_muts_lig$duet_scaled, other_muts_lig$duet_scaled, paired = F)
+
+# rd
+boxplot(dr_muts_lig$rd_values, other_muts_lig$rd_values, main = "RD")
+wilcox.test(dr_muts_lig$rd_values, other_muts_lig$rd_values)
+
+# kd
+boxplot(dr_muts_lig$kd_values, other_muts_lig$kd_values, main = "KD")
+wilcox.test(dr_muts_lig$kd_values, other_muts_lig$kd_values)
+
+# asa
+boxplot(dr_muts_lig$asa, other_muts_lig$asa, main = "ASA")
+wilcox.test(dr_muts_lig$asa, other_muts_lig$asa)
+
+# rsa
+boxplot(dr_muts_lig$rsa, other_muts_lig$rsa, main = "RSA")
+wilcox.test(dr_muts_lig$rsa, other_muts_lig$rsa)
+
+# checking agreement b/w mcsm and foldx
+cols_to_select = c("mutationinformation"
+                   , "mutation_info"
+                   , "duet_scaled"
+                   , "ddg"
+                   , "duet_outcome"
+                   , "foldx_outcome")
+
+merged_df3_short = select(merged_df3, cols_to_select)
+
+mcsm_foldx = merged_df3_short[which(merged_df3_short$duet_outcome != merged_df3_short$foldx_outcome),]
+
+mcsm_foldx$sign_comp = ifelse(sign(mcsm_foldx$duet_scaled)==sign(mcsm_foldx$ddg), "PASS", "FAIL")
+table(mcsm_foldx$sign_comp)
+
+# another way of checking
+merged_df3$sign_comp = ifelse(sign(merged_df3$duet_scaled)==sign(merged_df3$ddg), "PASS", "FAIL")
+table(merged_df3$sign_comp)
+
+disagreement = table(merged_df3$sign_comp)[2]/nrow(merged_df3)*100
+agreement = 100 - disagreement
+
+cat("There is", agreement, "% between mcsm and foldx predictions")

From fe49a45447c6fcac5f515b90203d48d5973bf332 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 8 Sep 2020 17:13:02 +0100
Subject: [PATCH 10/27] various changes

---
 scripts/combining_dfs.py                  | 224 +++++++++++++++-------
 scripts/data_extraction.py                |  41 ++--
 scripts/plotting/combining_dfs_plotting.R |  29 ++-
 3 files changed, 199 insertions(+), 95 deletions(-)

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index 622e8e5..be62177 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -55,6 +55,7 @@ os.getcwd()
 #from combining import combine_dfs_with_checks
 from combining_FIXME import detect_common_cols
 from reference_dict import oneletter_aa_dict # CHECK DIR STRUC THERE!
+from reference_dict import low_3letter_dict # CHECK DIR STRUC THERE!
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
@@ -79,6 +80,21 @@ gene    = args.gene
 datadir = args.datadir
 indir   = args.input_dir
 outdir  = args.output_dir
+
+gene_match = gene + '_p.'
+print('mut pattern for gene', gene, ':',  gene_match)
+
+nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
+print('nsSNP for gene', gene, ':',  nssnp_match)
+
+wt_regex = gene_match.lower()+'([A-Za-z]{3})'
+print('wt regex:', wt_regex)
+
+mut_regex = r'[0-9]+(\w{3})$'
+print('mt regex:', mut_regex)
+
+pos_regex = r'([0-9]+)'
+print('position regex:', pos_regex)
 #%%=======================================================================
 #==============
 # directories
@@ -214,7 +230,9 @@ combined_df[merging_cols_m4].apply(len) == len(combined_df)
 
 # OR merges: TEDIOUSSSS!!!!
 
-
+del(mcsm_df, foldx_df, mcsm_foldx_dfs, dssp_kd_dfs, dssp_kd_rd_dfs,rd_df, kd_df, infile_mcsm, infile_foldx, infile_dssp, infile_kd)
+del(merging_cols_m1, merging_cols_m2, merging_cols_m3, merging_cols_m4)
+del(in_filename_dssp, in_filename_foldx, in_filename_kd, in_filename_mcsm, in_filename_rd)
 #%%
 print('==================================='
       , '\nFifth merge: afor_df + afor_kin_df'
@@ -235,7 +253,7 @@ print('Dim of afor_df:', afor_df.shape
 # finding if ALL afor_kin_df muts are present in afor_df
 # i.e all kinship muts should be PRESENT in mycalcs_present
 if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]:
-    print('PASS: ALL or_kinship muts are present in my or list')
+    print('PASS: ALL', len(afor_kin_df), 'or_kinship muts are present in my or list')
 else:
     nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])])
     nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]    
@@ -246,10 +264,10 @@ else:
 
 # Now checking how many afor_df muts are NOT present in afor_kin_df    
 common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])])  
-extra_muts_myor =   afor_kin_df.shape[0] - common_muts
+extra_muts_myor = afor_kin_df.shape[0] - common_muts
 
 print('=========================================='
-      , '\nmy or calcs has', extra_muts_myor, 'extra mutations'
+      , '\nmy or calcs has', common_muts, 'present in af_or_kin_df'
       , '\n==========================================')
 
 print('Expected cals for merging with outer_join...')
@@ -257,10 +275,15 @@ print('Expected cals for merging with outer_join...')
 expected_rows = afor_df.shape[0] + extra_muts_myor
 expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5)
 
+
+afor_df['mutation']
+afor_kin_df['mutation']
+
 ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join)
 
 if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols:
-    print('PASS: OR dfs successfully combined! PHEWWWW!')
+    print('PASS but with duplicate muts: OR dfs successfully combined! PHEWWWW!'
+          , '\nDuplicate muts present but with different \'ref\' and \'alt\' alleles')
 else:
     print('FAIL: could not combine OR dfs'
           , '\nCheck expected rows and cols calculation and join type')
@@ -269,12 +292,12 @@ print('Dim of merged ors_df:', ors_df.shape)
 
 ors_df[merging_cols_m5].apply(len)
 ors_df[merging_cols_m5].apply(len) == len(ors_df)
+
 #%%============================================================================
 # formatting ors_df
 ors_df.columns
 
 # Dropping unncessary columns: already removed in ealier preprocessing
-#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
 cols_to_drop = ['n_miss']
 print('Dropping', len(cols_to_drop), 'columns:\n'
       , cols_to_drop)
@@ -327,7 +350,6 @@ column_order = ['mutation'
                 #, 'wt_3let' # old
                 #, 'mt_3let' # old
                 #, 'symbol'
-                #, 'n_miss'
                 ]
 
 if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())):
@@ -343,6 +365,61 @@ else:
 
 print('\nResult of Sixth merge:', ors_df_ordered.shape
       , '\n===================================================================')
+#%%
+ors_df_ordered.shape
+check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']]
+
+# populating 'nan' info
+lookup_dict = dict()
+for k, v in low_3letter_dict.items():
+    lookup_dict[k] = v['one_letter_code']
+    #print(lookup_dict)
+    
+    wt = ors_df_ordered['mutation'].str.extract(wt_regex).squeeze()
+    #print(wt)
+    ors_df_ordered['wild_type'] = wt.map(lookup_dict)
+    
+    ors_df_ordered['position'] = ors_df_ordered['mutation'].str.extract(pos_regex) 
+    
+    mt = ors_df_ordered['mutation'].str.extract(mut_regex).squeeze()
+    ors_df_ordered['mutant_type'] = mt.map(lookup_dict)
+
+ors_df_ordered['mutationinformation'] = ors_df_ordered['wild_type'] + ors_df_ordered.position.map(str) + ors_df_ordered['mutant_type']
+check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']]
+
+# populate mut_info_f1
+ors_df_ordered['mut_info_f1'].isna().sum()
+ors_df_ordered['mut_info_f1'] = ors_df_ordered['position'].astype(str) + ors_df_ordered['wild_type'] + '>' + ors_df_ordered['position'].astype(str) + ors_df_ordered['mutant_type']
+ors_df_ordered['mut_info_f1'].isna().sum()  
+
+# populate mut_info_f2
+ors_df_ordered['mut_info_f2'] = ors_df_ordered['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
+
+# populate mut_type
+ors_df_ordered['mut_type'].isna().sum()
+#mut_type_word = ors_df_ordered['mut_type'].value_counts()
+mut_type_word  = 'missense' # FIXME, should be derived
+ors_df_ordered['mut_type'].fillna(mut_type_word, inplace = True)
+ors_df_ordered['mut_type'].isna().sum()
+
+# populate gene_id
+ors_df_ordered['gene_id'].isna().sum()
+#gene_id_word = ors_df_ordered['gene_id'].value_counts()
+gene_id_word  = 'Rv2043c' # FIXME, should be derived
+ors_df_ordered['gene_id'].fillna(gene_id_word, inplace = True)
+ors_df_ordered['gene_id'].isna().sum()
+
+# populate gene_name
+ors_df_ordered['gene_name'].isna().sum()
+ors_df_ordered['gene_name'].value_counts()
+ors_df_ordered['gene_name'].fillna(gene, inplace = True)
+ors_df_ordered['gene_name'].isna().sum()
+
+# check numbers
+ors_df_ordered['or_kin'].isna().sum()
+# should be 0
+ors_df_ordered['or_mychisq'].isna().sum()
+
 #%%============================================================================ 
 print('==================================='
       , '\nSixth merge: Fourth + Fifth merge'
@@ -350,79 +427,94 @@ print('==================================='
       , '\n===================================')
 
 #combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join)
-merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered)
+merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered) 
+
+# dtype problems
+if len(merging_cols_m6) > 1 and 'position'in merging_cols_m6:
+    print('Removing \'position\' from merging_cols_m6 to make dtypes consistent'
+          , '\norig length of merging_cols_m6:', len(merging_cols_m6))
+    merging_cols_m6.remove('position')
+    print('\nlength after removing:', len(merging_cols_m6))
+
 print('Dim of df1:', combined_df.shape
       , '\nDim of df2:', ors_df_ordered.shape
       , '\nNo. of merging_cols:', len(merging_cols_m6))
 
 print('Checking mutations in the two dfs:'
-      , '\nmuts in df1 but NOT in df2:'
+      , '\nmuts in df1 present in df2:'
       , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum()
-      , '\nmuts in df2 but NOT in df1:'
+      , '\nmuts in df2 present in df1:'
       , ors_df_ordered['mutationinformation'].isin(combined_df['mutationinformation']).sum())
 
-#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) )
-
-combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join)
+#----------
+# merge 6
+#----------
+combined_df_all = pd.merge(combined_df, ors_df_ordered, on = merging_cols_m6, how = o_join)
 combined_df_all.shape
 
-# populate mut_info_f1
-combined_df_all['mut_info_f1'].isna().sum()
-combined_df_all['mut_info_f1'] = combined_df_all['position'].astype(str) + combined_df_all['wild_type'] + '>' + combined_df_all['position'].astype(str) + combined_df_all['mutant_type']
-combined_df_all['mut_info_f1'].isna().sum()  
-
-# populate mut_type
-combined_df_all['mut_type'].isna().sum()
-#mut_type_word = combined_df_all['mut_type'].value_counts()
-mut_type_word  = 'missense' # FIXME, should be derived
-combined_df_all['mut_type'].fillna(mut_type_word, inplace = True)
-combined_df_all['mut_type'].isna().sum()
-
-# populate gene_id
-combined_df_all['gene_id'].isna().sum()
-#gene_id_word = combined_df_all['gene_id'].value_counts()
-gene_id_word  = 'Rv2043c' # FIXME, should be derived
-combined_df_all['gene_id'].fillna(gene_id_word, inplace = True)
-combined_df_all['gene_id'].isna().sum()
-
-# populate gene_name
-combined_df_all['gene_name'].isna().sum()
-combined_df_all['gene_name'].value_counts()
-combined_df_all['gene_name'].fillna(gene, inplace = True)
-combined_df_all['gene_name'].isna().sum()
-
-
-# FIXME: DIM
-# only with left join!
-outdf_expected_rows = len(combined_df)
+# sanity check for merge 6
+outdf_expected_rows = len(combined_df) + extra_muts_myor
+unique_muts = len(combined_df)
 outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6)
 
-#if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all.shape[0] == outdf_expected_rows:  
-if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == outdf_expected_rows:
+if combined_df_all.shape[0] == outdf_expected_rows and combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == unique_muts:
     print('PASS: Df dimension match'
-          , '\nDim of combined_df_all with join type:', l_join
+          , '\ncombined_df_all with join type:', o_join
           , '\n', combined_df_all.shape
           , '\n===============================================================')
 else:
     print('FAIL: Df dimension mismatch'
              , 'Cannot generate expected dim. See details of merge performed'
              , '\ndf1 dim:', combined_df.shape
-             , '\ndf2 dim:', ors_df.shape
+             , '\ndf2 dim:', ors_df_ordered.shape
              , '\nGot:', combined_df_all.shape
              , '\nmuts in df1 but NOT in df2:'
-             , combined_df['mutationinformation'].isin(ors_df['mutationinformation']).sum()
+             , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum()
              , '\nmuts in df2 but NOT in df1:'
              , ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum())
     sys.exit()
-    
+ 
+# drop extra cols
+all_cols = combined_df_all.columns
+
+#pos_cols_check = combined_df_all[['position_x','position_y']]
+c = combined_df_all[['position_x','position_y']].isna().sum()
+pos_col_to_drop = c.index[c>0].to_list()
+cols_to_drop = pos_col_to_drop +  ['wild_type_kd']   
+
+print('Dropping', len(cols_to_drop), 'columns:\n', cols_to_drop)
+combined_df_all.drop(cols_to_drop, axis = 1, inplace = True) 
+ 
+# rename position_x to position
+pos_col_to_rename = c.index[c==0].to_list()
+combined_df_all.shape
+combined_df_all.rename(columns = { pos_col_to_rename[0]: 'position'}, inplace = True)
+combined_df_all.shape
+
+all_cols = combined_df_all.columns
+
+#%% reorder cols to for convenience
+first_cols = ['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']
+last_cols = [col for col in combined_df_all.columns if col not in first_cols]
+
+df = combined_df_all[first_cols+last_cols]
 
 #%% IMPORTANT: check if mutation related info is all populated after this merge  
-# FIXME: should get fixed with JP's resolved dataset!?
-check_nan = combined_df_all.isna().sum(axis = 0)
+# select string colnames to ensure no NA exist there
+string_cols = combined_df_all.columns[combined_df_all.applymap(lambda x: isinstance(x, str)).all(0)]
+
+if (combined_df_all[string_cols].isna().sum(axis = 0)== 0).all():
+    print('PASS: All string cols are populated with no NAs')
+else:
+    print('FAIL: NAs detected in string cols')
+    print(combined_df_all[string_cols].isna().sum(axis = 0))
+    sys.exit()
+
 # relevant mut cols
 check_mut_cols = merging_cols_m5 + merging_cols_m6
 
 count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'})
+print(check_mut_cols)
 
 if (count_na_mut_cols['na_count'].sum() > 0).any():
     # FIXME: static override, generate 'mutation' from variable
@@ -434,31 +526,29 @@ else:
     print('No missing \'mutation\' has been detected!')
     
     
-lookup_dict = dict()
-for k, v in oneletter_aa_dict.items():
-    lookup_dict[k] = v['three_letter_code_lower']
-    print(lookup_dict)
-    wt_3let = combined_df_all['wild_type'].map(lookup_dict)
-    #print(wt_3let)
-    pos = combined_df_all['position'].astype(str)
-    #print(pos)
-    mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
-    #print(mt_3let)
-    # override the 'mutation' column
-    combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
-    print(combined_df_all['mutation'])    
 
-# populate mut_info_f2
-combined_df_all['mut_info_f2'] = combined_df_all['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
+#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
+col_to_drop = ['wild_type_kd']
+print('Dropping', len(col_to_drop), 'columns:\n'
+      , col_to_drop)
+combined_df_all.drop(col_to_drop, axis = 1, inplace = True)
+
+
 
 #%% check
 #cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2']
 #foo = combined_df_all[cols_check]
-
+foo = combined_df_all.drop_duplicates('mutationinformation')
+foo2 = combined_df_all.drop_duplicates('mutation')
+poo = combined_df_all[combined_df_all['mutation'].isna()]
 #%%============================================================================
 output_cols = combined_df_all.columns
 #print('Output cols:', output_cols)
-
+#%% drop duplicates
+if combined_df_all.shape[0] != outdf_expected_rows:
+    print('INFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele')
+else:
+    print('combined_df_all has no duplicate muts present')
 #%%============================================================================ 
 # write csv
 print('Writing file: combined output of all params needed for plotting and ML')
diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py
index 3f72b1b..f16e136 100755
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -81,16 +81,16 @@ gene = args.gene
 gene_match = gene + '_p.'
 print('mut pattern for gene', gene, ':',  gene_match)
 
-nssnp_match = gene_match +'[A-Z]{3}[0-9]+[A-Z]{3}'
+nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
 print('nsSNP for gene', gene, ':',  nssnp_match)
 
-wt_regex = gene_match.lower()+'(\w{3})'
+wt_regex = gene_match.lower()+'([A-Za-z]{3})'
 print('wt regex:', wt_regex)
 
-mut_regex = r'\d+(\w{3})$'
+mut_regex = r'[0-9]+(\w{3})$'
 print('mt regex:', mut_regex)
 
-pos_regex = r'(\d+)'
+pos_regex = r'([0-9]+)'
 print('position regex:', pos_regex)
 
 # building cols to extract
@@ -154,30 +154,29 @@ if in_filename_master == 'original_tanushree_data_v2.csv':
 else:
     core_cols = ['id'
                  , 'sample'
-                 , 'patient_id'
-                 , 'strain'
+                 #, 'patient_id'
+                 #, 'strain'
                  , 'lineage'
                  , 'sublineage' 
-                 , 'country'
+                 #, 'country'
                  , 'country_code'
                  , 'geographic_source'
                  #, 'region'
-                 , 'location'
-                 , 'host_body_site'
-                 , 'environment_material'
-                 , 'host_status'
-                 , 'host_sex'
-                 , 'submitted_host_sex'
-                 , 'hiv_status'
-                 , 'HIV_status'
-                 , 'tissue_type'
-                 , 'isolation_source'
+                 #, 'location'
+                 #, 'host_body_site'
+                 #, 'environment_material'
+                 #, 'host_status'
+                 #, 'host_sex'
+                 #, 'submitted_host_sex'
+                 #, 'hiv_status'
+                 #, 'HIV_status'
+                 #, 'tissue_type'
+                 #, 'isolation_source'
                  ,  resistance_col]
     
     variable_based_cols = [drug
                            , dr_muts_col
                            , other_muts_col]
-                           #, resistance_col]
        
     cols_to_extract = core_cols + variable_based_cols
     print('Extracting', len(cols_to_extract), 'columns from master data')
@@ -200,7 +199,7 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum()
 
 #%% Write check file
 check_file = outdir + '/' + gene.lower() + '_gwas.csv'
-meta_data.to_csv(check_file)
+meta_data.to_csv(check_file, index = False)
 print('Writing subsetted gwas data'
       , '\nFile', check_file
       , '\nDim:', meta_data.shape)
@@ -215,9 +214,9 @@ print('Writing subsetted gwas data'
 # drug counts: complete samples for OR calcs
 meta_data[drug].value_counts() 
 print('===========================================================\n'
-      , 'RESULT: No. of Sus and Res samples:\n', meta_data[drug].value_counts()
+      , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
 	  , '\n===========================================================\n'
-      , 'RESULT: Percentage of Sus and Res samples:\n', meta_data[drug].value_counts(normalize = True)*100
+      , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
 		, '\n===========================================================')
 
 #%%
diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R
index 7d45d1b..38474d9 100644
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@@ -58,6 +58,20 @@ rm(my_df, upos, dup_muts)
 #in_file1: output of plotting_data.R
 # my_df_u
 
+# quick checks
+head(my_df_u[, c("mutation", "mutation2")])
+
+cols_to_extract  = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin")
+foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract]
+
+
+which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af))
+
+
+baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq)
+colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or")
+
+
 # infile 2: gene associated meta data
 #in_filename_gene_metadata = paste0(tolower(gene),  "_meta_data_with_AFandOR.csv")
 in_filename_gene_metadata = paste0(tolower(gene),  "_metadata.csv")
@@ -94,6 +108,7 @@ gene_metadata <- read.csv(infile_gene_metadata
                       , header = T)
 cat("Dim:", dim(gene_metadata))
 
+
 # counting NAs in AF, OR cols: 
 if (identical(sum(is.na(my_df_u$or_mychisq))
               , sum(is.na(my_df_u$pval_fisher))
@@ -230,9 +245,9 @@ if (identical(sum(is.na(merged_df3$or_kin))
 if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_kin)))
   && identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin)))
   && identical( which(is.na(merged_df2$pval_fisher)), which(is.na(merged_df2$pwald_kin))) ){
-  cat('PASS: Indices match for mychisq and kin ors missing values')
+  cat("PASS: Indices match for mychisq and kin ors missing values")
 } else{
-  cat('Index mismatch: mychisq and kin ors missing indices match')
+  cat("Index mismatch: mychisq and kin ors missing indices match")
   quit()
 }
 
@@ -245,7 +260,7 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"
     ,"\nfilename: merged_df2_comp")
 
 if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
-  print('mychisq and kin ors missing indices match. Procedding with omitting NAs')
+  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
   na_count_df2 = sum(is.na(merged_df2$af))
   merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
   # sanity check: no +-1 gymnastics
@@ -262,7 +277,7 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
         ,"\nGot no. of rows: ", nrow(merged_df2_comp))
   }
 }else{
-  print('Index mismatch for mychisq and kin ors. Aborting NA ommission')
+  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
 }
 
 #=========================
@@ -272,7 +287,7 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
 #=========================
 
 if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){
-  print('mychisq and kin ors missing indices match. Procedding with omitting NAs')
+  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
   na_count_df3 = sum(is.na(merged_df3$af))
   #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
   merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
@@ -289,7 +304,7 @@ if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){
         ,"\nGot no. of rows: ", nrow(merged_df3_comp))
    }
 } else{
-  print('Index mismatch for mychisq and kin ors. Aborting NA ommission')
+  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
 } 
   
 # alternate way of deriving merged_df3_comp
@@ -347,7 +362,7 @@ merged_df3_comp_lig = merged_df3_comp[merged_df3_comp$ligand_distance<10,]
 if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){
   print("PASS: verified merged_df3_lig")
 }else{
-  cat(paste0('FAIL: nrow mismatch for merged_df3_lig'
+  cat(paste0("FAIL: nrow mismatch for merged_df3_lig"
              , "\nExpected:", nrow(my_df_u_lig)
              , "\nGot:", nrow(merged_df3_lig)))
 }

From 42986bb119da58c74f67c4175c2164502ecaa3ac Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 8 Sep 2020 17:46:52 +0100
Subject: [PATCH 11/27] hopefully finally sorted data merges!

---
 scripts/combining_dfs.py | 78 +++++++++++++++++++++++-----------------
 1 file changed, 46 insertions(+), 32 deletions(-)

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index be62177..eab2808 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -73,7 +73,6 @@ args = arg_parser.parse_args()
 #%% variable assignment: input and output 
 #drug = 'pyrazinamide'
 #gene = 'pncA'
-gene_match = gene + '_p.'
 
 drug    = args.drug
 gene    = args.gene
@@ -312,18 +311,13 @@ column_order = ['mutation'
                 , 'wild_type'               
                 , 'position'
                 , 'mutant_type'                              
-                #, 'chr_num_allele' #old
                 , 'ref_allele'
                 , 'alt_allele'
                 , 'mut_info_f1'
                 , 'mut_info_f2'
                 , 'mut_type'
                 , 'gene_id'
-                #, 'gene_number' #old
                 , 'gene_name'
-                #, 'mut_region'                
-                #, 'reference_allele'
-                #, 'alternate_allele'
                 , 'chromosome_number'                
                 , 'af'
                 , 'af_kin'               
@@ -346,11 +340,7 @@ column_order = ['mutation'
                 , 'se_kin'                            
                 , 'zval_logistic'
                 , 'logl_h1_kin'
-                , 'l_remle_kin'
-                #, 'wt_3let' # old
-                #, 'mt_3let' # old
-                #, 'symbol'
-                ]
+                , 'l_remle_kin']
 
 if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())):
     print('PASS: Column order generated for all:', len(column_order), 'columns'
@@ -516,39 +506,63 @@ check_mut_cols = merging_cols_m5 + merging_cols_m6
 count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'})
 print(check_mut_cols)
 
-if (count_na_mut_cols['na_count'].sum() > 0).any():
-    # FIXME: static override, generate 'mutation' from variable
-    na_muts_n = combined_df_all['mutation'].isna().sum() 
-    #baz = combined_df_all[combined_df_all['mutation'].isna()]
-    print(na_muts_n, 'mutations have missing \'mutation\' info.'
+c2 = combined_df_all[check_mut_cols].isna().sum()
+missing_info_cols = c2.index[c2>0].to_list()
+
+if c2.sum()>0:
+    #na_muts_n = combined_df_all['mutation'].isna().sum() 
+    na_muts_n = combined_df_all[missing_info_cols].isna().sum() 
+    print(na_muts_n.values[0], 'mutations have missing \'mutation\' info.'
           , '\nFetching these from reference dict...')
 else:
     print('No missing \'mutation\' has been detected!')
     
-    
-
-#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
-col_to_drop = ['wild_type_kd']
-print('Dropping', len(col_to_drop), 'columns:\n'
-      , col_to_drop)
-combined_df_all.drop(col_to_drop, axis = 1, inplace = True)
-
+lookup_dict = dict()
+for k, v in oneletter_aa_dict.items():
+    lookup_dict[k] = v['three_letter_code_lower']
+    print(lookup_dict)
+    wt_3let = combined_df_all['wild_type'].map(lookup_dict)
+    #print(wt_3let)
+    pos = combined_df_all['position'].astype(str)
+    #print(pos)
+    mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
+    #print(mt_3let)
+    # override the 'mutation' column
+    combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
+    print(combined_df_all['mutation'])    
 
+# check again
+if combined_df_all[missing_info_cols].isna().sum().all() == 0: 
+    print('PASS: No mutations have missing \'mutation\' info.')
+else:
+    print('FAIL:', combined_df_all[missing_info_cols].isna().sum().values[0]
+          , '\nmutations have missing info STILL...')
+    sys.exit()
 
 #%% check
-#cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2']
-#foo = combined_df_all[cols_check]
 foo = combined_df_all.drop_duplicates('mutationinformation')
 foo2 = combined_df_all.drop_duplicates('mutation')
-poo = combined_df_all[combined_df_all['mutation'].isna()]
+if foo.equals(foo2):
+    print('PASS: Dropping mutation or mutatationinformation has the same effect\n')
+else:
+    print('FAIL: Still problems in merged data')
+    sys.exit()
+
 #%%============================================================================
 output_cols = combined_df_all.columns
-#print('Output cols:', output_cols)
-#%% drop duplicates
-if combined_df_all.shape[0] != outdf_expected_rows:
-    print('INFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele')
+
+#%% IMPORTANT result info
+if combined_df_all.shape[0] == outdf_expected_rows:
+    print('\nINFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele'
+          , '\n=============================================================')
 else:
-    print('combined_df_all has no duplicate muts present')
+    print('combined_df_all has no duplicate muts present'
+          ,'\n===============================================================')
+    
+print('\nDim of combined_data:', combined_df_all.shape
+      , '\nNo. of unique mutations:', combined_df_all['mutationinformation'].nunique())
+    
+
 #%%============================================================================ 
 # write csv
 print('Writing file: combined output of all params needed for plotting and ML')

From eb5491aad9c2d60fbedb7eca6a74d3c0b00a0361 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 8 Sep 2020 17:52:45 +0100
Subject: [PATCH 12/27] outputting revised all params file

---
 scripts/combining_dfs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index eab2808..af8be26 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -487,7 +487,7 @@ all_cols = combined_df_all.columns
 first_cols = ['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']
 last_cols = [col for col in combined_df_all.columns if col not in first_cols]
 
-df = combined_df_all[first_cols+last_cols]
+combined_df_all = combined_df_all[first_cols+last_cols]
 
 #%% IMPORTANT: check if mutation related info is all populated after this merge  
 # select string colnames to ensure no NA exist there

From 46b43cf261ccf1766c48dac8b3bb27ac83916324 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 8 Sep 2020 18:51:03 +0100
Subject: [PATCH 13/27] changing category of ambiguous muts

---
 scripts/combining_dfs.py                  |  8 +++++
 scripts/plotting/combining_dfs_plotting.R | 39 ++++++++++++++++++++---
 scripts/plotting/plotting_data.R          | 12 +++++++
 3 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index af8be26..8b8e556 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -552,6 +552,14 @@ else:
 output_cols = combined_df_all.columns
 
 #%% IMPORTANT result info
+if combined_df_all['or_mychisq'].isna().sum() == len(combined_df) - len(afor_df):
+    print('PASS: No. of NA in or_mychisq matches expected length'
+          , '\nNo. of with NA in or_mychisq:', combined_df_all['or_mychisq'].isna().sum() 
+          , '\nNo. of NA in or_kin:', combined_df_all['or_kin'].isna().sum())
+else:
+    print('FAIL: No. of NA in or_mychisq does not match expected length')
+
+
 if combined_df_all.shape[0] == outdf_expected_rows:
     print('\nINFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele'
           , '\n=============================================================')
diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R
index 38474d9..bba0e00 100644
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@@ -59,18 +59,23 @@ rm(my_df, upos, dup_muts)
 # my_df_u
 
 # quick checks
-head(my_df_u[, c("mutation", "mutation2")])
+head(my_df_u[, c("mutation")])
 
 cols_to_extract  = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin")
 foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract]
 
 
-which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af))
+table(which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af)))
 
+baz = read.csv(file.choose())
 
 baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq)
+baz = as.data.frame(baz)
 colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or")
+sum(is.na(baz$my_df_u_or)) == sum(is.na(my_df_u$or_mychisq))
 
+cat("\nNo. of with NA in or_mychisq:", sum(is.na(my_df_u$or_mychisq))
+    ,"\nNo. of NA in or_kin:" , sum(is.na(my_df_u$or_kin)))
 
 # infile 2: gene associated meta data
 #in_filename_gene_metadata = paste0(tolower(gene),  "_meta_data_with_AFandOR.csv")
@@ -109,7 +114,8 @@ gene_metadata <- read.csv(infile_gene_metadata
 cat("Dim:", dim(gene_metadata))
 
 
-# counting NAs in AF, OR cols: 
+# counting NAs in AF, OR cols
+# or_mychisq
 if (identical(sum(is.na(my_df_u$or_mychisq))
               , sum(is.na(my_df_u$pval_fisher))
               , sum(is.na(my_df_u$af)))){
@@ -123,7 +129,7 @@ if (identical(sum(is.na(my_df_u$or_mychisq))
       , "\nNA in AF:", sum(is.na(my_df_u$af)))
 }
 
-
+# or kin
 if (identical(sum(is.na(my_df_u$or_kin))
               , sum(is.na(my_df_u$pwald_kin))
               , sum(is.na(my_df_u$af_kin)))){
@@ -139,6 +145,31 @@ if (identical(sum(is.na(my_df_u$or_kin))
 
 str(gene_metadata)
 
+# change category of ambiguos mutations
+table(gene_metadata$mutation_info)
+
+cols_to_extract2  = c("mutationinformation", "mutation", "mutation_info")
+foo2 = gene_metadata[, colnames(gene_metadata)%in% cols_to_extract2]
+
+dr_muts = foo2[foo2$mutation_info == dr_muts_col,]
+other_muts = foo2[foo2$mutation_info == other_muts_col,]
+
+common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,]
+#write.csv(common_muts, 'common_muts.csv')
+
+# FIXME read properly
+# "ambiguous_mut_names.csv"
+#"pnca_p.gly108arg", "pnca_p.gly132ala", "pnca_p.val180phe"
+ambiguous_muts = read.csv(file.choose())
+ambiguous_muts_names = ambiguous_muts$mutation
+
+common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,]
+
+gene_metadata2 = gene_metadata
+
+if (gene_metadata$mutation_info[gene_metadata$mutation%in%ambiguous_muts_names] == other_muts_col){
+  print('change me')
+}
 ###################################################################
 #                           combining: PS
 ###################################################################
diff --git a/scripts/plotting/plotting_data.R b/scripts/plotting/plotting_data.R
index 291579e..7cfe0d8 100755
--- a/scripts/plotting/plotting_data.R
+++ b/scripts/plotting/plotting_data.R
@@ -52,6 +52,18 @@ in_filename_params = paste0(tolower(gene), "_all_params.csv")
 infile_params = paste0(outdir, "/", in_filename_params)
 cat(paste0("Input file 1:", infile_params) )
 
+
+dr_muts_col = paste0('dr_mutations_', drug)
+dr_muts_col  = paste0('other_mutations_', drug)
+
+cat('Extracting columns based on variables:\n'
+      , drug
+      , '\n'
+      , dr_muts_col
+      , '\n'
+      , other_muts_col
+      , '\n===============================================================')
+
 #%%===============================================================
 ###########################
 # Read file: struct params

From b7c7ffc018cfacfa8ea9d9b14274f671d1a638fe Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 11:26:13 +0100
Subject: [PATCH 14/27] `resolved ambiguous muts and generated clean output.
 Also seaprated dir.R

---
 scripts/data_extraction.py                |  2 +-
 scripts/plotting/combining_dfs_plotting.R | 39 +++++++++++++++++++++--
 scripts/plotting/opp_mcsm_muts.R          |  1 +
 scripts/plotting/plotting_data.R          | 26 ++++-----------
 scripts/plotting/running_plot_scripts     |  4 +++
 5 files changed, 49 insertions(+), 23 deletions(-)

diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py
index f16e136..66f57e1 100755
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -1170,7 +1170,7 @@ del(out_filename_metadata_poscounts)
 
 #%% Write file: gene_metadata (i.e gene_LF1)
 # where each row has UNIQUE mutations NOT unique sample ids
-out_filename_metadata = gene.lower() + '_metadata.csv'
+out_filename_metadata = gene.lower() + '_metadata_raw.csv'
 outfile_metadata = outdir + '/' + out_filename_metadata
 print('Writing file: LF formatted data'
       , '\nFile:', outfile_metadata
diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R
index bba0e00..5f5d284 100644
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@@ -156,6 +156,7 @@ other_muts = foo2[foo2$mutation_info == other_muts_col,]
 
 common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,]
 #write.csv(common_muts, 'common_muts.csv')
+rm(common_muts)
 
 # FIXME read properly
 # "ambiguous_mut_names.csv"
@@ -165,11 +166,45 @@ ambiguous_muts_names = ambiguous_muts$mutation
 
 common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,]
 
-gene_metadata2 = gene_metadata
-
 if (gene_metadata$mutation_info[gene_metadata$mutation%in%ambiguous_muts_names] == other_muts_col){
   print('change me')
 }
+
+# make a copy
+gene_metadata2 = gene_metadata
+table(gene_metadata$mutation_info)
+count_check = as.data.frame(cbind(table(gene_metadata$mutationinformation, gene_metadata$mutation_info)))
+#count_check$checks = ifelse(count_check$dr_mutations_pyrazinamide&&count_check$other_mutations_pyrazinamide>0, "ambi", "pass")
+table(count_check$checks)
+
+
+poo = c("V180F", "G132A", "D49G")
+poo2 = count_check[rownames(count_check)%in%poo,]
+poo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0
+poo2$checks = ifelse(poo2$checkspoo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0, "ambi", "pass")
+
+# remove common_muts_all
+ids = gene_metadata$mutation%in%common_muts_all$mutation; table(ids)
+gene_metadata_unambiguous = gene_metadata2[!ids,]
+
+# sanity checks: should be true
+table(gene_metadata_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_unambiguous)
+nrow(gene_metadata_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata)
+
+# correct common muts
+table(common_muts_all$mutation_info)
+common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info)
+
+# change the other_muts to dr_muts
+common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col
+
+table(common_muts_all$mutation_info)
+common_muts_all$mutation_info = factor(common_muts_all$mutation_info)
+table(common_muts_all$mutation_info)
+
+# add it back to 
+gene_meta_data
+
 ###################################################################
 #                           combining: PS
 ###################################################################
diff --git a/scripts/plotting/opp_mcsm_muts.R b/scripts/plotting/opp_mcsm_muts.R
index 5b02f34..1ca1f6c 100644
--- a/scripts/plotting/opp_mcsm_muts.R
+++ b/scripts/plotting/opp_mcsm_muts.R
@@ -39,6 +39,7 @@ rm(my_df, upos, dup_muts)
 #in_file1: output of plotting_data.R
 # my_df_u
 
+#===========
 # output
 #===========
 # mutations with opposite effects
diff --git a/scripts/plotting/plotting_data.R b/scripts/plotting/plotting_data.R
index 7cfe0d8..049f6c8 100755
--- a/scripts/plotting/plotting_data.R
+++ b/scripts/plotting/plotting_data.R
@@ -15,7 +15,10 @@ library(ggplot2)
 library(data.table)
 library(dplyr)
 
+source("dirs.R")
+
 require("getopt", quietly = TRUE) #cmd parse arguments
+
 #========================================================
 # command line args
 #spec = matrix(c(
@@ -31,19 +34,6 @@ require("getopt", quietly = TRUE) #cmd parse arguments
 #  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
 #}
 #========================================================
-#%% variable assignment: input and output paths & filenames
-drug = "pyrazinamide"
-gene = "pncA"
-gene_match = paste0(gene,"_p.")
-cat(gene_match)
-
-#=============
-# directories
-#=============
-datadir = paste0("~/git/Data")
-indir = paste0(datadir, "/", drug, "/input")
-outdir = paste0("~/git/Data", "/", drug, "/output")
-plotdir = paste0("~/git/Data", "/", drug, "/output/plots")
 #======
 # input
 #======
@@ -53,15 +43,14 @@ infile_params = paste0(outdir, "/", in_filename_params)
 cat(paste0("Input file 1:", infile_params) )
 
 
-dr_muts_col = paste0('dr_mutations_', drug)
-dr_muts_col  = paste0('other_mutations_', drug)
-
-cat('Extracting columns based on variables:\n'
+cat('columns based on variables:\n'
       , drug
       , '\n'
       , dr_muts_col
       , '\n'
       , other_muts_col
+      , "\n"
+      , resistance_col
       , '\n===============================================================')
 
 #%%===============================================================
@@ -74,9 +63,6 @@ my_df = read.csv(infile_params, header = T)
 
 cat("\nInput dimensions:", dim(my_df)) 
 
-# quick checks
-#colnames(my_df)
-#str(my_df)
 
 ###########################
 # extract unique mutation entries
diff --git a/scripts/plotting/running_plot_scripts b/scripts/plotting/running_plot_scripts
index 4abdd9a..365fcb5 100644
--- a/scripts/plotting/running_plot_scripts
+++ b/scripts/plotting/running_plot_scripts
@@ -1,6 +1,10 @@
 #========
 # plotting
 #========
+FIXME:
+
+#dirs.R: imports dir structure
+change this to cmd so you can pass in drug and gene name
 
 #=======================
 plotting_data.R

From 09e4f7bfbd55c99976e57e0743dcea225e776e1e Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 11:36:40 +0100
Subject: [PATCH 15/27] add dirs and resolving_ambiguous_muts

---
 scripts/plotting/dirs.R                     |  66 ++++++++++
 scripts/plotting/resolving_ambiguous_muts.R | 136 ++++++++++++++++++++
 2 files changed, 202 insertions(+)
 create mode 100644 scripts/plotting/dirs.R
 create mode 100644 scripts/plotting/resolving_ambiguous_muts.R

diff --git a/scripts/plotting/dirs.R b/scripts/plotting/dirs.R
new file mode 100644
index 0000000..789225f
--- /dev/null
+++ b/scripts/plotting/dirs.R
@@ -0,0 +1,66 @@
+#!/usr/bin/env Rscript             
+#########################################################
+# TASK: formatting data that will be used for various plots
+
+# useful links
+#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns
+#########################################################
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting")
+getwd()
+
+#source("Header_TT.R")
+library(ggplot2)
+library(data.table)
+library(dplyr)
+
+require("getopt", quietly = TRUE) #cmd parse arguments
+#========================================================
+# command line args
+#spec = matrix(c(
+#  "drug"   , "d", 1, "character",
+#  "gene"   , "g", 1, "character"
+#), byrow = TRUE, ncol = 4)
+
+#opt = getopt(spec)
+
+#drug = opt$druggene = opt$gene
+
+#if(is.null(drug)|is.null(gene)) {
+#  stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)")
+#}
+#========================================================
+# FIXME: change to cmd
+#%% variable assignment: input and output paths & filenames
+drug = "pyrazinamide"
+gene = "pncA"
+gene_match = paste0(gene,"_p.")
+cat(gene_match)
+
+#=============
+# directories and variables
+#=============
+datadir = paste0("~/git/Data")
+indir = paste0(datadir, "/", drug, "/input")
+outdir = paste0("~/git/Data", "/", drug, "/output")
+plotdir = paste0("~/git/Data", "/", drug, "/output/plots")
+
+dr_muts_col = paste0('dr_mutations_', drug)
+other_muts_col  = paste0('other_mutations_', drug)
+resistance_col = "drtype"
+
+cat('columns based on variables:\n'
+      , drug
+      , '\n'
+      , dr_muts_col
+      , '\n'
+      , other_muts_col
+      , "\n"
+      , resistance_col
+      , '\n===============================================================')
+
+#%%===============================================================
+
+
+
diff --git a/scripts/plotting/resolving_ambiguous_muts.R b/scripts/plotting/resolving_ambiguous_muts.R
new file mode 100644
index 0000000..8edb806
--- /dev/null
+++ b/scripts/plotting/resolving_ambiguous_muts.R
@@ -0,0 +1,136 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: To resolve ambiguous muts present in <gene>_metadata.csv
+#(which is one of the outputs from python script)
+
+# Input csv file: <gene>_metadata.csv
+
+# Output csv file: <gene>_metadata_clean.csv
+
+#########################################################
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+source("dirs.R")
+
+cat("Directories imported:"
+    , "\n===================="
+    , "\ndatadir:", datadir
+    , "\nindir:", indir
+    , "\noutdir:", outdir
+    , "\nplotdir:", plotdir)
+
+cat("Variables imported:"
+    , "\n====================="
+    , "\ndrug:", drug
+    , "\ngene:", gene
+    , "\ngene_match:", gene_match
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)
+
+#========================================================
+#===========
+# input
+#===========
+# infile1: gene associated meta data
+gene_metadata_raw = paste0(tolower(gene),  "_metadata_raw.csv")
+infile_gene_metadata_raw = paste0(outdir, "/", gene_metadata_raw)
+cat("Input infile 1:", infile_gene_metadata_raw)
+
+# infile 2: ambiguous muts
+ambiguous_muts = paste0(tolower(gene),  "_ambiguous_mut_names.csv")
+infile_ambiguous_muts = paste0(outdir, "/", ambiguous_muts)
+cat("Input infile 2:", infile_ambiguous_muts)
+
+#===========
+# output
+#===========
+# clean gene_metadat with ambiguous muts resolved
+gene_metadata_clean = paste0(tolower(gene), "_metadata.csv")
+outfile_gene_metadata_clean  = paste0(outdir, "/", gene_metadata_clean)
+cat("Output file:", outfile_gene_metadata_clean)
+
+#%%===============================================================
+
+###########################
+# Read file: <gene>_meta data raw.csv
+###########################
+cat("Reading meta data file:", infile_gene_metadata_raw)
+
+gene_metadata_raw <- read.csv(infile_gene_metadata_raw
+                      , stringsAsFactors = F
+                      , header = T)
+
+cat("Dim:", dim(gene_metadata_raw))
+
+###########################
+# Read file: ambiguous muts.csv
+##########################
+ambiguous_muts = read.csv(infile_ambiguous_muts)
+
+ambiguous_muts_names = ambiguous_muts$mutation
+
+common_muts_all = gene_metadata_raw[gene_metadata_raw$mutation%in%ambiguous_muts_names,]
+
+#==============
+# resolving ambiguous muts
+#===============
+table(gene_metadata_raw$mutation_info)
+count_check = as.data.frame(cbind(table(gene_metadata_raw$mutationinformation, gene_metadata_raw$mutation_info)))
+count_check$checks = ifelse(count_check[[dr_muts_col]]&count_check[[other_muts_col]]>0
+                            , "ambi", "pass")
+table(count_check$checks)
+
+cat(table(count_check$checks)[["ambi"]], "ambiguous muts detected. Correcting these")
+
+# remove all ambiguous muts from df
+ids = gene_metadata_raw$mutation%in%common_muts_all$mutation; table(ids)
+gene_metadata_raw_unambiguous = gene_metadata_raw[!ids,]
+
+# sanity checks: should be true
+table(gene_metadata_raw_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_raw_unambiguous)
+nrow(gene_metadata_raw_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata_raw)
+
+# check before resolving ambiguous muts
+table(common_muts_all$mutation_info)
+common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info)
+
+# resolving ambiguous muts: change other_muts to dr_muts
+common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col
+
+table(common_muts_all$mutation_info)
+common_muts_all$mutation_info = factor(common_muts_all$mutation_info)
+table(common_muts_all$mutation_info)
+common_muts_all$mutation_info =  as.character(common_muts_all$mutation_info)
+
+# combining to a clean df
+gene_metadata = rbind(gene_metadata_raw_unambiguous, common_muts_all)
+all(dim(gene_metadata) == dim(gene_metadata))
+table(gene_metadata$mutation_info)
+
+count_check2 = as.data.frame(cbind(table(gene_metadata$mutationinformation
+                                         , gene_metadata$mutation_info)))
+
+count_check2$checks = ifelse(count_check2[[dr_muts_col]]&count_check2[[other_muts_col]]>0
+                            , "ambi", "pass")
+
+if (table(count_check2$checks) == nrow(count_check2)){
+  cat("PASS: ambiguous muts successfully resolved."
+      , "\nwriting file")
+}else{
+  print("FAIL: ambiguous muts could not be resolved!")
+  quit()
+}
+#============
+# writing file
+#=============
+write.csv(gene_metadata, outfile_gene_metadata_clean
+          , row.names = FALSE)
+
+#=====================================================================
+# End of script
+#======================================================================

From 774b34ef00806ee1ac53fabb54881e1963884570 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 11:45:09 +0100
Subject: [PATCH 16/27] updated dir.R

---
 scripts/plotting/dirs.R | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/scripts/plotting/dirs.R b/scripts/plotting/dirs.R
index 789225f..54fb0bf 100644
--- a/scripts/plotting/dirs.R
+++ b/scripts/plotting/dirs.R
@@ -50,16 +50,6 @@ dr_muts_col = paste0('dr_mutations_', drug)
 other_muts_col  = paste0('other_mutations_', drug)
 resistance_col = "drtype"
 
-cat('columns based on variables:\n'
-      , drug
-      , '\n'
-      , dr_muts_col
-      , '\n'
-      , other_muts_col
-      , "\n"
-      , resistance_col
-      , '\n===============================================================')
-
 #%%===============================================================
 
 

From 31b98fb3d37eb31b484168e353096372f3237327 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 12:00:42 +0100
Subject: [PATCH 17/27]  plotting script with resolved gene metadata

---
 scripts/plotting/combining_dfs_plotting.R | 186 ++++++----------------
 1 file changed, 48 insertions(+), 138 deletions(-)

diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R
index 5f5d284..7549de6 100644
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@@ -36,18 +36,22 @@ source("plotting_data.R")
 # my_df_u_lig
 # dup_muts
 
-cat(paste0("Directories imported:"
-           , "\ndatadir:", datadir
-           , "\nindir:", indir
-           , "\noutdir:", outdir
-           , "\nplotdir:", plotdir))
+cat("Directories imported:"
+    , "\n===================="
+    , "\ndatadir:", datadir
+    , "\nindir:", indir
+    , "\noutdir:", outdir
+    , "\nplotdir:", plotdir)
+
+cat("Variables imported:"
+    , "\n====================="
+    , "\ndrug:", drug
+    , "\ngene:", gene
+    , "\ngene_match:", gene_match
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)
 
-cat(paste0("Variables imported:"
-           , "\ndrug:", drug
-           , "\ngene:", gene
-           , "\ngene_match:", gene_match
-           , "\nLength of upos:", length(upos)
-           , "\nAngstrom symbol:", angstroms_symbol))       
 
 # clear excess variable
 rm(my_df, upos, dup_muts)
@@ -58,25 +62,6 @@ rm(my_df, upos, dup_muts)
 #in_file1: output of plotting_data.R
 # my_df_u
 
-# quick checks
-head(my_df_u[, c("mutation")])
-
-cols_to_extract  = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin")
-foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract]
-
-
-table(which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af)))
-
-baz = read.csv(file.choose())
-
-baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq)
-baz = as.data.frame(baz)
-colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or")
-sum(is.na(baz$my_df_u_or)) == sum(is.na(my_df_u$or_mychisq))
-
-cat("\nNo. of with NA in or_mychisq:", sum(is.na(my_df_u$or_mychisq))
-    ,"\nNo. of NA in or_kin:" , sum(is.na(my_df_u$or_kin)))
-
 # infile 2: gene associated meta data
 #in_filename_gene_metadata = paste0(tolower(gene),  "_meta_data_with_AFandOR.csv")
 in_filename_gene_metadata = paste0(tolower(gene),  "_metadata.csv")
@@ -113,6 +98,8 @@ gene_metadata <- read.csv(infile_gene_metadata
                       , header = T)
 cat("Dim:", dim(gene_metadata))
 
+table(gene_metadata$mutation_info)
+
 
 # counting NAs in AF, OR cols
 # or_mychisq
@@ -145,66 +132,6 @@ if (identical(sum(is.na(my_df_u$or_kin))
 
 str(gene_metadata)
 
-# change category of ambiguos mutations
-table(gene_metadata$mutation_info)
-
-cols_to_extract2  = c("mutationinformation", "mutation", "mutation_info")
-foo2 = gene_metadata[, colnames(gene_metadata)%in% cols_to_extract2]
-
-dr_muts = foo2[foo2$mutation_info == dr_muts_col,]
-other_muts = foo2[foo2$mutation_info == other_muts_col,]
-
-common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,]
-#write.csv(common_muts, 'common_muts.csv')
-rm(common_muts)
-
-# FIXME read properly
-# "ambiguous_mut_names.csv"
-#"pnca_p.gly108arg", "pnca_p.gly132ala", "pnca_p.val180phe"
-ambiguous_muts = read.csv(file.choose())
-ambiguous_muts_names = ambiguous_muts$mutation
-
-common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,]
-
-if (gene_metadata$mutation_info[gene_metadata$mutation%in%ambiguous_muts_names] == other_muts_col){
-  print('change me')
-}
-
-# make a copy
-gene_metadata2 = gene_metadata
-table(gene_metadata$mutation_info)
-count_check = as.data.frame(cbind(table(gene_metadata$mutationinformation, gene_metadata$mutation_info)))
-#count_check$checks = ifelse(count_check$dr_mutations_pyrazinamide&&count_check$other_mutations_pyrazinamide>0, "ambi", "pass")
-table(count_check$checks)
-
-
-poo = c("V180F", "G132A", "D49G")
-poo2 = count_check[rownames(count_check)%in%poo,]
-poo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0
-poo2$checks = ifelse(poo2$checkspoo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0, "ambi", "pass")
-
-# remove common_muts_all
-ids = gene_metadata$mutation%in%common_muts_all$mutation; table(ids)
-gene_metadata_unambiguous = gene_metadata2[!ids,]
-
-# sanity checks: should be true
-table(gene_metadata_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_unambiguous)
-nrow(gene_metadata_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata)
-
-# correct common muts
-table(common_muts_all$mutation_info)
-common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info)
-
-# change the other_muts to dr_muts
-common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col
-
-table(common_muts_all$mutation_info)
-common_muts_all$mutation_info = factor(common_muts_all$mutation_info)
-table(common_muts_all$mutation_info)
-
-# add it back to 
-gene_meta_data
-
 ###################################################################
 #                           combining: PS
 ###################################################################
@@ -307,15 +234,6 @@ if (identical(sum(is.na(merged_df3$or_kin))
       , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
 }
 
-# check if the same or and afs are missing for 
-if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_kin)))
-  && identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin)))
-  && identical( which(is.na(merged_df2$pval_fisher)), which(is.na(merged_df2$pwald_kin))) ){
-  cat("PASS: Indices match for mychisq and kin ors missing values")
-} else{
-  cat("Index mismatch: mychisq and kin ors missing indices match")
-  quit()
-}
 
 #=========================
 # Merge3: merged_df2_comp
@@ -325,25 +243,21 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"
     ,"\nlinking col: Mutationinforamtion"
     ,"\nfilename: merged_df2_comp")
 
-if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
-  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
-  na_count_df2 = sum(is.na(merged_df2$af))
-  merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
-  # sanity check: no +-1 gymnastics
-  cat("Checking nrows in merged_df2_comp")
-  if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
-    cat("\nPASS: No. of rows match"
-        ,"\nDim of merged_df2_comp: "
-        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
-        , "\nNo. of rows: ", nrow(merged_df2_comp)
-        , "\nNo. of cols: ", ncol(merged_df2_comp))
-  }else{
-    cat("FAIL: No. of rows mismatch"
-        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
-        ,"\nGot no. of rows: ", nrow(merged_df2_comp))
-  }
+na_count_df2 = sum(is.na(merged_df2$af))
+merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
+
+# sanity check: no +-1 gymnastics
+cat("Checking nrows in merged_df2_comp")
+if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
+  cat("\nPASS: No. of rows match"
+      ,"\nDim of merged_df2_comp: "
+      ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
+      , "\nNo. of rows: ", nrow(merged_df2_comp)
+      , "\nNo. of cols: ", ncol(merged_df2_comp))
 }else{
-  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
+  cat("FAIL: No. of rows mismatch"
+      ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
+      ,"\nGot no. of rows: ", nrow(merged_df2_comp))
 }
 
 #=========================
@@ -351,28 +265,24 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
 # same as merge 2 but excluding NAs from ORs, etc or 
 # remove duplicate mutation information
 #=========================
+na_count_df3 = sum(is.na(merged_df3$af))
+#merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
+
+merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
+cat("Checking nrows in merged_df3_comp")
+
+if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
+  cat("\nPASS: No. of rows match"
+      ,"\nDim of merged_df3_comp: "
+      ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
+      , "\nNo. of rows: ", nrow(merged_df3_comp)
+      , "\nNo. of cols: ", ncol(merged_df3_comp))
+}else{
+  cat("FAIL: No. of rows mismatch"
+      ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
+      ,"\nGot no. of rows: ", nrow(merged_df3_comp))
+}
 
-if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){
-  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
-  na_count_df3 = sum(is.na(merged_df3$af))
-  #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
-  merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
-  cat("Checking nrows in merged_df3_comp")
-  if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
-    cat("\nPASS: No. of rows match"
-        ,"\nDim of merged_df3_comp: "
-        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
-        , "\nNo. of rows: ", nrow(merged_df3_comp)
-        , "\nNo. of cols: ", ncol(merged_df3_comp))
-    }else{
-    cat("FAIL: No. of rows mismatch"
-        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
-        ,"\nGot no. of rows: ", nrow(merged_df3_comp))
-   }
-} else{
-  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
-} 
-  
 # alternate way of deriving merged_df3_comp
 foo = merged_df3[!is.na(merged_df3$af),]
 bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]

From 19a984f22813254c3d9b8be80067e5ff77a30154 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 12:53:53 +0100
Subject: [PATCH 18/27] generated lineage dist plots combined. needs tweaking

---
 scripts/plotting/combining_dfs_plotting.R |   2 +
 scripts/plotting/lineage_basic_barplot.R  |  35 +--
 scripts/plotting/lineage_dist_PS.R        | 273 +++++++++++++---------
 3 files changed, 191 insertions(+), 119 deletions(-)

diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R
index 7549de6..38ddd0f 100644
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@@ -48,6 +48,8 @@ cat("Variables imported:"
     , "\ndrug:", drug
     , "\ngene:", gene
     , "\ngene_match:", gene_match
+    , "\nAngstrom symbol:", angstroms_symbol
+    , "\nNo. of duplicated muts:", dup_muts_nu
     , "\ndr_muts_col:", dr_muts_col
     , "\nother_muts_col:", other_muts_col
     , "\ndrtype_col:", resistance_col)
diff --git a/scripts/plotting/lineage_basic_barplot.R b/scripts/plotting/lineage_basic_barplot.R
index 8b107bb..e4503d1 100644
--- a/scripts/plotting/lineage_basic_barplot.R
+++ b/scripts/plotting/lineage_basic_barplot.R
@@ -30,21 +30,27 @@ source("combining_dfs_plotting.R")
 # 9) my_df_u
 # 10) my_df_u_lig
 
-cat(paste0("Directories imported:"
-           , "\ndatadir:", datadir
-           , "\nindir:", indir
-           , "\noutdir:", outdir
-           , "\nplotdir:", plotdir))
+cat("Directories imported:"
+    , "\n===================="
+    , "\ndatadir:", datadir
+    , "\nindir:", indir
+    , "\noutdir:", outdir
+    , "\nplotdir:", plotdir)
+
+cat("Variables imported:"
+    , "\n====================="
+    , "\ndrug:", drug
+    , "\ngene:", gene
+    , "\ngene_match:", gene_match
+    , "\nAngstrom symbol:", angstroms_symbol
+    , "\nNo. of duplicated muts:", dup_muts_nu
+    , "\nNA count for ORs:", na_count
+    , "\nNA count in df2:", na_count_df2
+    , "\nNA count in df3:", na_count_df3
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)
 
-cat(paste0("Variables imported:"
-           , "\ndrug:", drug
-           , "\ngene:", gene
-           , "\ngene_match:", gene_match
-           , "\nAngstrom symbol:", angstroms_symbol
-           , "\nNo. of duplicated muts:", dup_muts_nu
-           , "\nNA count for ORs:", na_count
-           , "\nNA count in df2:", na_count_df2
-           , "\nNA count in df3:", na_count_df3))       
 
 #===========
 # input
@@ -67,7 +73,6 @@ plot_basic_bp_lineage  =  paste0(plotdir,"/", basic_bp_lineage)
 #================
 # REASSIGNMENT as necessary
 my_df  = merged_df2
-#my_df  = merged_df2_comp
 
 # clear excess variable
 rm(merged_df2_comp, merged_df3, merged_df3_comp)
diff --git a/scripts/plotting/lineage_dist_PS.R b/scripts/plotting/lineage_dist_PS.R
index 432af84..1c46f57 100644
--- a/scripts/plotting/lineage_dist_PS.R
+++ b/scripts/plotting/lineage_dist_PS.R
@@ -3,36 +3,55 @@ getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting/")
 getwd()
 
-########################################################################
-# 				Installing and loading required packages 			   #
-########################################################################
+#########################################################
+# TASK: Lineage dist plots: ggridges
 
-source("../Header_TT.R")
-#source("../barplot_colour_function.R")
-#require(data.table)
+# Output: 2 SVGs for PS stability
 
-########################################################################
-#		 Read file: call script for combining df for PS			   	   #
-########################################################################
+# 1) all muts
+# 2) dr_muts
 
-source("combining_two_df.R")
+##########################################################
+# 				Installing and loading required packages 			 
+##########################################################
 
-#---------------------- PAY ATTENTION
-# the above changes the working dir
-#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
-#---------------------- PAY ATTENTION
+source("combining_dfs_plotting.R")
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
 
-#==========================
-# This will return:
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
 
-# df with NA for <drug>
-# merged_df2
-# merged_df3
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat("Directories imported:"
+    , "\n===================="
+    , "\ndatadir:", datadir
+    , "\nindir:", indir
+    , "\noutdir:", outdir
+    , "\nplotdir:", plotdir)
+
+cat("Variables imported:"
+    , "\n====================="
+    , "\ndrug:", drug
+    , "\ngene:", gene
+    , "\ngene_match:", gene_match
+    , "\nAngstrom symbol:", angstroms_symbol
+    , "\nNo. of duplicated muts:", dup_muts_nu
+    , "\nNA count for ORs:", na_count
+    , "\nNA count in df2:", na_count_df2
+    , "\nNA count in df3:", na_count_df3
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)
 
-# df without NA for <drug>
-# merged_df2_comp
-# merged_df3_comp
-#===========================
 
 ###########################
 # Data for plots
@@ -43,13 +62,8 @@ source("combining_two_df.R")
 # we lose some muts and at this level, we should use
 # as much info as available, hence use df with NA
 ###########################
-
-# uncomment as necessary
-#%%%%%%%%%%%%%%%%%%%%%%%%
 # REASSIGNMENT
 my_df  = merged_df2
-#my_df  = merged_df2_comp
-#%%%%%%%%%%%%%%%%%%%%%%%%
 
 # delete variables not required
 rm(my_df_u, merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
@@ -63,7 +77,7 @@ is.factor(my_df$lineage)
 my_df$lineage = as.factor(my_df$lineage)
 is.factor(my_df$lineage)
 
-table(my_df$mutation_info); str(my_df$mutation_info)
+table(my_df$mutation_info)
 
 # subset df with dr muts only
 my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") 
@@ -74,59 +88,32 @@ table(my_df_dr$mutation_info)
 ########################################################################
 
 #==========================
-# Run two times: 
-# uncomment as necessary
-# 1) for all muts
-# 2) for dr_muts
-#===========================
-
-#%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT
-
-#================
-# for ALL muts
-#================
-plot_df = my_df  
-my_plot_name = 'lineage_dist_PS.svg'
-
-plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
-
-#my_plot_name = 'lineage_dist_PS_comp.svg'
-
-#================
-# for dr muts ONLY
-#================
-plot_df = my_df_dr 
-#my_plot_name = 'lineage_dist_dr_PS.svg'
-#my_plot_name = 'lineage_dist_dr_PS_comp.svg'
-my_plot_name = 'lineage_dist_drug_muts_PS.svg'
-
-plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
-
-#%%%%%%%%%%%%%%%%%%%%%%%%
-
-#==========================
-# Plot: Lineage Distribution
+# Plot 1: ALL Muts
 # x = mcsm_values, y = dist
 # fill = stability
 #============================
 
+my_plot_name = 'lineage_dist_PS.svg'
+
+plot_lineage_duet  =  paste0(plotdir,"/", my_plot_name)
+
 #===================
 # Data for plots
 #===================
-table(plot_df$lineage); str(plot_df$lineage)
+table(my_df$lineage); str(my_df$lineage)
 
 # subset only lineages1-4
 sel_lineages = c("lineage1"
                  , "lineage2"
                  , "lineage3"
-                 , "lineage4")
+                 , "lineage4"
                  #, "lineage5"
                  #, "lineage6"
-                 #, "lineage7")
+                 #, "lineage7"
+                 )
 
 # uncomment as necessary
-df_lin = subset(plot_df, subset = lineage %in% sel_lineages )
+df_lin = subset(my_df, subset = lineage %in% sel_lineages )
 table(df_lin$lineage)
 
 # refactor
@@ -139,16 +126,8 @@ table(df_lin$lineage)#{RESULT: No of samples within lineage}
 length(unique(df_lin$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
 
 length(df_lin$mutationinformation)
-# sanity checks
-# FIXME
-r1 = 2:7 # when merged_df2 used: because there is missing lineages 
-if(sum(table(plot_df$lineage)[r1]) == nrow(df_lin)) {
-  print ("sanity check passed: numbers match")
-} else{
-  print("Error!: check your numbers")
-} 
 
-u2 = unique(plot_df$mutationinformation)
+u2 = unique(my_df$mutationinformation)
 u = unique(df_lin$mutationinformation)
 check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
 
@@ -173,11 +152,11 @@ names(my_labels) = c('lineage1', 'lineage2', 'lineage3', 'lineage4'
                     # , 'lineage5', 'lineage6', 'lineage7'
                      )
 # check plot name
-my_plot_name
+plot_lineage_duet
 
 # output svg
-svg(plot_lineage_duet)
-printFile = ggplot(df, aes(x = duet_scaled
+#svg(plot_lineage_duet)
+p1 = ggplot(df, aes(x = duet_scaled
                             , y = duet_outcome))+
   
   #printFile=geom_density_ridges_gradient(
@@ -186,47 +165,133 @@ printFile = ggplot(df, aes(x = duet_scaled
                                 , size = 0.3 ) +
   facet_wrap( ~lineage
               , scales = "free"
-#             , switch = 'x'
+             #, switch = 'x'
               , labeller = labeller(lineage = my_labels) ) +
-  coord_cartesian( xlim = c(-1, 1)
-#                   , ylim = c(0, 6)
-#                   , clip = "off" 
-) +
+  coord_cartesian( xlim = c(-1, 1)) +
   scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
                         , name = "DUET" ) + 
   theme(axis.text.x = element_text(size = my_ats
                                      , angle = 90
                                      , hjust = 1
                                      , vjust = 0.4)
-#                  , axis.text.y = element_text(size = my_ats
-#                                                , angle = 0
-#                                                , hjust = 1
-#                                                , vjust = 0)
+
          , axis.text.y = element_blank()
          , axis.title.x = element_blank()
          , axis.title.y = element_blank()
          , axis.ticks.y = element_blank()
          , plot.title = element_blank()
          , strip.text = element_text(size = my_als)
-         , legend.text = element_text(size = 10)
+         , legend.text = element_text(size = my_als-5)
          , legend.title = element_text(size = my_als)
-#                  , legend.position = c(0.3, 0.8)
-#                  , legend.key.height = unit(1, 'mm')
+) 
+
+print(p1)
+#dev.off()
+
+#######################################################################
+# lineage distribution plot for dr_muts
+#######################################################################
+
+#==========================
+# Plot 2: dr muts ONLY
+# x = mcsm_values, y = dist
+# fill = stability
+#============================
+
+my_plot_name_dr = 'lineage_dist_dr_muts_PS.svg'
+
+plot_lineage_dr_duet  =  paste0(plotdir,"/", my_plot_name_dr)
+
+#===================
+# Data for plots
+#===================
+table(my_df_dr$lineage); str(my_df_dr$lineage)
+
+# uncomment as necessary
+df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages)
+table(df_lin_dr$lineage)
+
+# refactor
+df_lin_dr$lineage = factor(df_lin_dr$lineage)
+
+sum(table(df_lin_dr$lineage)) #{RESULT: Total number of samples for lineage}
+
+table(df_lin_dr$lineage)#{RESULT: No of samples within lineage}
+
+length(unique(df_lin_dr$mutationinformation))#{Result: No. of unique mutations the 4 lineages contribute to}
+
+length(df_lin_dr$mutationinformation)
+
+u2 = unique(my_df_dr$mutationinformation)
+u = unique(df_lin_dr$mutationinformation)
+check = u2[!u2%in%u]; print(check) #{Muts not present within selected lineages}
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+df_dr <- df_lin_dr
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+
+rm(df_lin_dr)
+
+#******************
+# generate distribution plot of lineages
+#******************
+# 2 : ggridges (good!)
+my_ats = 15 # axis text size
+my_als = 20 # axis label size
+
+
+# check plot name
+plot_lineage_dr_duet
+
+# output svg
+#svg(plot_lineage_dr_duet)
+p2 = ggplot(df_dr, aes(x = duet_scaled
+                           , y = duet_outcome))+
+  
+  #printFile=geom_density_ridges_gradient(
+  geom_density_ridges_gradient(aes(fill = ..x..)
+                               , scale = 3
+                               , size = 0.3 ) +
+  facet_wrap( ~lineage
+              , scales = "free"
+              #, switch = 'x'
+              , labeller = labeller(lineage = my_labels) ) +
+  coord_cartesian( xlim = c(-1, 1)
+                   #, ylim = c(0, 6)
+                   #, clip = "off" 
+  ) +
+  scale_fill_gradientn(colours = c("#f8766d", "white", "#00bfc4")
+                       , name = "DUET" ) + 
+  theme(axis.text.x = element_text(size = my_ats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4)
+        , axis.text.y = element_blank()
+        , axis.title.x = element_blank()
+        , axis.title.y = element_blank()
+        , axis.ticks.y = element_blank()
+        , plot.title = element_blank()
+        , strip.text = element_text(size = my_als)
+        #, legend.text = element_text(size = 10)
+        #, legend.title = element_text(size = my_als)
+        , legend.position = "none"
   ) 
 
+print(p2)
+#dev.off()
+#====================
+#combine
+#=======
+# output
+#=======
+lineage_dist_combined = "lineage_dist_combined_PS.svg"
+plot_lineage_dist_combined  =  paste0(plotdir,"/", lineage_dist_combined)
+
+svg(plot_lineage_dist_combined, width = 16, height = 12)
+
+printFile = cowplot::plot_grid(p1, p2
+                               , label_size = my_als+10)
+
 print(printFile)
 dev.off()
-
-#=!=!=!=!=!=!=!
-# COMMENT: Not much differences in the distributions
-# when using merged_df2 or merged_df2_comp.
-# Also, the lineage differences disappear when looking at all muts
-# The pattern we are interested in is possibly only for dr_mutations
-#=!=!=!=!=!=!=!
-#===================================================
-
-# COMPARING DISTRIBUTIONS: KS test 
-# run: "../KS_test_PS.R"
-
-
-

From 080cd6375d39014dbdbb6be243e63db6fa1d0520 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 13:18:57 +0100
Subject: [PATCH 19/27] lineage dist plots combined generated  Please enter the
 commit message for your changes. Lines starting

---
 scripts/plotting/lineage_dist_PS.R    | 19 ++++++++++---------
 scripts/plotting/running_plot_scripts | 17 +++++++++++++++++
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/scripts/plotting/lineage_dist_PS.R b/scripts/plotting/lineage_dist_PS.R
index 1c46f57..d62b038 100644
--- a/scripts/plotting/lineage_dist_PS.R
+++ b/scripts/plotting/lineage_dist_PS.R
@@ -1,8 +1,4 @@
-#!/usr/bin/env Rscript       
-getwd()
-setwd("~/git/LSHTM_analysis/scripts/plotting/")
-getwd()
-
+#!/usr/bin/env Rscript  
 #########################################################
 # TASK: Lineage dist plots: ggridges
 
@@ -14,7 +10,12 @@ getwd()
 ##########################################################
 # 				Installing and loading required packages 			 
 ##########################################################
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
 
+source("Header_TT.R")
+library(ggridges)
 source("combining_dfs_plotting.R")
 # PS combined: 
 # 1) merged_df2
@@ -273,9 +274,9 @@ p2 = ggplot(df_dr, aes(x = duet_scaled
         , axis.ticks.y = element_blank()
         , plot.title = element_blank()
         , strip.text = element_text(size = my_als)
-        #, legend.text = element_text(size = 10)
-        #, legend.title = element_text(size = my_als)
-        , legend.position = "none"
+        , legend.text = element_text(size = 10)
+        , legend.title = element_text(size = my_als)
+        #, legend.position = "none"
   ) 
 
 print(p2)
@@ -288,7 +289,7 @@ print(p2)
 lineage_dist_combined = "lineage_dist_combined_PS.svg"
 plot_lineage_dist_combined  =  paste0(plotdir,"/", lineage_dist_combined)
 
-svg(plot_lineage_dist_combined, width = 16, height = 12)
+svg(plot_lineage_dist_combined, width = 12, height = 6)
 
 printFile = cowplot::plot_grid(p1, p2
                                , label_size = my_als+10)
diff --git a/scripts/plotting/running_plot_scripts b/scripts/plotting/running_plot_scripts
index 365fcb5..6749fa1 100644
--- a/scripts/plotting/running_plot_scripts
+++ b/scripts/plotting/running_plot_scripts
@@ -3,6 +3,9 @@
 #========
 FIXME:
 
+#=======================
+dirs.R
+#=======================
 #dirs.R: imports dir structure
 change this to cmd so you can pass in drug and gene name
 
@@ -11,6 +14,13 @@ plotting_data.R
 #=======================
 ??? update how to run
 
+
+#=======================
+combining_dfs_plotting.R
+#=======================
+??? update how to run
+
+
 #=======================
 mcsm_mean_stability.R
 #=======================
@@ -62,6 +72,13 @@ barplots_subcolours_PS.R
 # output plots: 1 svg
     1) barplot_coloured_PS.svg
 
+#=======================
+lineage_dist_PS.R
+# lineage distribution for all muts and dr muts in one figure
+#=======================
+# input: calls the "combining_dfs_plotting.R"
+# output plots: 1 svg
+    1) lineage_dist_combined_PS.svg
 
     
 ########################################################################

From f424f4e2d69e963c5f3e752adbd699a8ca25fd6b Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 13:36:37 +0100
Subject: [PATCH 20/27] corrected subcols_axis name in sucols_all_PS

---
 scripts/plotting/barplots_subcolours_aa_PS.R |  2 +-
 scripts/plotting/columns_all_params.csv      | 87 --------------------
 2 files changed, 1 insertion(+), 88 deletions(-)
 delete mode 100644 scripts/plotting/columns_all_params.csv

diff --git a/scripts/plotting/barplots_subcolours_aa_PS.R b/scripts/plotting/barplots_subcolours_aa_PS.R
index 5850abf..95a94a9 100755
--- a/scripts/plotting/barplots_subcolours_aa_PS.R
+++ b/scripts/plotting/barplots_subcolours_aa_PS.R
@@ -20,7 +20,7 @@ library(ggplot2)
 library(data.table)
 source("barplot_colour_function.R")
 #source("subcols_axis.R")
-source("subcols_axis_PS.R")
+source("subcols_axis.R")
 
 # should return the following dfs, directories and variables
 # mut_pos_cols
diff --git a/scripts/plotting/columns_all_params.csv b/scripts/plotting/columns_all_params.csv
deleted file mode 100644
index fb8adc3..0000000
--- a/scripts/plotting/columns_all_params.csv
+++ /dev/null
@@ -1,87 +0,0 @@
-,x,,changes,
-1,mutationinformation,,Mutationinformation,
-2,wild_type,,,consider...wild_aa
-3,position,,Position,
-4,mutant_type,,,consider...mutant_aa
-5,chain,,,
-6,ligand_id,,,
-7,ligand_distance,,,
-8,duet_stability_change,,,
-9,duet_outcome,,DUET_outcome,
-10,ligand_affinity_change,,,
-11,ligand_outcome,,Lig_outcome,
-12,duet_scaled,,ratioDUET,
-13,affinity_scaled,,ratioPredAff,
-14,wild_pos,,WildPos,
-15,wild_chain_pos,,,
-16,ddg,,,
-17,contacts,,,
-18,electro_rr,,,
-19,electro_mm,,,
-20,electro_sm,,,
-21,electro_ss,,,
-22,disulfide_rr,,,
-23,disulfide_mm,,,
-24,disulfide_sm,,,
-25,disulfide_ss,,,
-26,hbonds_rr,,,
-27,hbonds_mm,,,
-28,hbonds_sm,,,
-29,hbonds_ss,,,
-30,partcov_rr,,,
-31,partcov_mm,,,
-32,partcov_sm,,,
-33,partcov_ss,,,
-34,vdwclashes_rr,,,
-35,vdwclashes_mm,,,
-36,vdwclashes_sm,,,
-37,vdwclashes_ss,,,
-38,volumetric_rr,,,
-39,volumetric_mm,,,
-40,volumetric_sm,,,
-41,volumetric_ss,,,
-42,wild_type_dssp,,,
-43,asa,,,
-44,rsa,,,
-45,ss,,,
-46,ss_class,,,
-47,chain_id,,,
-48,wild_type_kd,,,
-49,kd_values,,,
-50,rd_values,,,
-51,wt_3letter_caps,,,
-52,mutation,,,
-53,af,,,
-54,beta_logistic,,,
-55,or_logistic,,,
-56,pval_logistic,,,
-57,se_logistic,,,
-58,zval_logistic,,,
-59,ci_low_logistic,,,
-60,ci_hi_logistic,,,
-61,or_mychisq,,,
-62,or_fisher,,,
-63,pval_fisher,,,
-64,ci_low_fisher,,,
-65,ci_hi_fisher,,,
-66,est_chisq,,,
-67,pval_chisq,,,
-68,chromosome_number,,,
-69,ref_allele,,,
-70,alt_allele,,,
-71,mut_type,,,
-72,gene_id,,,
-73,gene_number,,,
-74,mut_region,,,
-75,mut_info,,,
-76,chr_num_allele,,,
-77,wt_3let,,,
-78,mt_3let,,,
-79,af_kin,,,
-80,or_kin,,,
-81,pwald_kin,,,
-82,beta_kin,,,
-83,se_kin,,,
-84,logl_h1_kin,,,
-85,l_remle_kin,,,
-86,n_miss,,,

From 5025e47983a49fbbfd624427662ce4bad5101dc0 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 17:34:32 +0100
Subject: [PATCH 21/27] renamed lineage_dist

---
 scripts/plotting/barplots_subcolours_aa_PS.R                   | 3 +--
 .../plotting/{lineage_dist_PS.R => lineage_dist_combined_PS.R} | 0
 2 files changed, 1 insertion(+), 2 deletions(-)
 rename scripts/plotting/{lineage_dist_PS.R => lineage_dist_combined_PS.R} (100%)

diff --git a/scripts/plotting/barplots_subcolours_aa_PS.R b/scripts/plotting/barplots_subcolours_aa_PS.R
index 95a94a9..ea0b690 100755
--- a/scripts/plotting/barplots_subcolours_aa_PS.R
+++ b/scripts/plotting/barplots_subcolours_aa_PS.R
@@ -1,5 +1,4 @@
-#!/usr/bin/env Rscript   
-getwd()
+#!/usr/bin/env Rscript   getwd()
 setwd("~/git/LSHTM_analysis/scripts/plotting")
 getwd()
 
diff --git a/scripts/plotting/lineage_dist_PS.R b/scripts/plotting/lineage_dist_combined_PS.R
similarity index 100%
rename from scripts/plotting/lineage_dist_PS.R
rename to scripts/plotting/lineage_dist_combined_PS.R

From e570454cf2aa7c9a509ab5f78d6c27946312f783 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 18:56:59 +0100
Subject: [PATCH 22/27] saving work

---
 scripts/plotting/barplots_subcolours_aa_PS.R | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/scripts/plotting/barplots_subcolours_aa_PS.R b/scripts/plotting/barplots_subcolours_aa_PS.R
index ea0b690..7c0a3aa 100755
--- a/scripts/plotting/barplots_subcolours_aa_PS.R
+++ b/scripts/plotting/barplots_subcolours_aa_PS.R
@@ -18,7 +18,6 @@ getwd()
 library(ggplot2)
 library(data.table)
 source("barplot_colour_function.R")
-#source("subcols_axis.R")
 source("subcols_axis.R")
 
 # should return the following dfs, directories and variables
@@ -160,9 +159,7 @@ min(df$duet_scaled)
 max(df$duet_scaled)
 
 # sanity checks
-# very important!!!!
 tapply(df$duet_scaled, df$duet_outcome, min)
-
 tapply(df$duet_scaled, df$duet_outcome, max)
 
 # My colour FUNCTION: based on group and subgroup
@@ -240,7 +237,7 @@ outPlot = g +
          , axis.ticks.x = element_blank()) +
   labs(title = ""
        #title = my_title
-       , x = "position"
+       , x = "Position"
        , y = "Frequency")
 
 print(outPlot)

From f85b1bd9022d049d60db1525b756e36cd83909d1 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 18:57:28 +0100
Subject: [PATCH 23/27] script to generate combined ps plot wit af and or

---
 scripts/plotting/ps_plots_combined.R | 251 +++++++++++++++++++++++++++
 1 file changed, 251 insertions(+)
 create mode 100644 scripts/plotting/ps_plots_combined.R

diff --git a/scripts/plotting/ps_plots_combined.R b/scripts/plotting/ps_plots_combined.R
new file mode 100644
index 0000000..afcdc25
--- /dev/null
+++ b/scripts/plotting/ps_plots_combined.R
@@ -0,0 +1,251 @@
+#!/usr/bin/env Rscript  
+#########################################################
+# TASK: AF, OR and stability plots: PS
+# Output: 1 SVG
+
+#########################################################
+# 				Installing and loading required packages 			 
+##########################################################
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+source("Header_TT.R")
+library(ggplot2)
+library(data.table)
+#source("combining_dfs_plotting.R")
+source("barplot_colour_function.R")
+source("subcols_axis.R")
+
+###########################
+# Data for PS plots
+# you need merged_df3_comp/merged_df_comp
+# since these have unique SNPs
+###########################
+
+no_or_index = which(is.na(my_df_u_cols$or_mychisq))
+
+my_df_u_cols_clean = my_df_u_cols[-no_or_index,]
+
+#%%%%%%%%%%%%%%%%%%%%%%%%
+# REASSIGNMENT
+df = my_df_u_cols_clean
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+cols_to_select = colnames(mut_pos_cols)
+
+mut_pos_cols_clean = df[colnames(df)%in%cols_to_select]
+mut_pos_cols_clean = unique(mut_pos_cols_clean[, 1:length(mut_pos_cols_clean)])
+  
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Barplot with axis colours:
+#===========================
+#================
+# Inspecting mut_pos_cols
+# position numbers and colours and assigning axis colours based on lab_fg
+# of the correct df
+# open file from desktop ("sample_axis_cols") for cross checking
+#================
+# very important!
+#my_axis_colours = mut_pos_cols$lab_fg
+
+if (nrow(mut_pos_cols_clean) == length(unique(df$position)) ){
+  print("PASS: lengths checked, assigning axis colours")
+  my_axis_colours = mut_pos_cols_clean$lab_fg
+  cat("length of axis colours:", length(my_axis_colours)
+      , "\nwhich corresponds to the number of positions on the x-axis of the plot")
+}else{
+  print("FAIL:lengths mismatch, could not assign axis colours")
+  quit()
+}
+
+# unique positions
+upos = unique(df$position); length(upos)
+
+table(df$duet_outcome)
+
+# add frequency of positions (from lib data.table)
+setDT(df)[, pos_count := .N, by = .(position)]
+
+# this is cummulative
+table(df$pos_count)
+
+# use group by on this
+library(dplyr)
+snpsBYpos_df <- df %>%
+  group_by(position) %>%
+  summarize(snpsBYpos = mean(pos_count))
+
+table(snpsBYpos_df$snpsBYpos)
+
+snp_count = sort(unique(snpsBYpos_df$snpsBYpos))
+
+# sanity checks
+# should be a factor
+if (is.factor(df$duet_outcome)){
+  print("duet_outcome is factor")
+}else{
+  print("converting duet_outcome to factor")
+  df$duet_outcome = as.factor(df$duet_outcome)
+}
+
+is.factor(df$duet_outcome)
+
+# should be -1 and 1
+min(df$duet_scaled)
+max(df$duet_scaled)
+
+# sanity checks
+tapply(df$duet_scaled, df$duet_outcome, min)
+tapply(df$duet_scaled, df$duet_outcome, max)
+
+# My colour FUNCTION: based on group and subgroup
+# in my case;
+# df = df
+# group = duet_outcome
+# subgroup = normalised score i.e duet_scaled
+
+# check unique values in normalised data
+u = unique(df$duet_scaled) 
+cat("No. of unique values in normalised data:", length(u))
+
+# Define group
+# Create an extra column called group which contains the "gp name and score" 
+# so colours can be generated for each unique values in this column
+my_grp = df$duet_scaled # no rounding
+df$group <- paste0(df$duet_outcome, "_", my_grp, sep = "") 
+
+# Call the function to create the palette based on the group defined above
+colours <- ColourPalleteMulti(df, "duet_outcome", "my_grp")
+print(paste0("Colour palette generated for: ", length(colours), " colours"))
+my_title = "Protein stability (DUET)"
+cat("No. of axis colours: ", length(my_axis_colours))
+
+#******************
+# generate plot: barplot unordered by frequency with axis colours
+#******************
+class(df$lab_bg)
+
+# define cartesian coord
+my_xlim = length(unique(df$position)); my_xlim
+
+# axis label size
+my_xals = 18
+my_yals = 18
+
+# axes text size
+my_xats = 14
+my_yats = 18
+
+g3 = ggplot(df, aes(factor(position, ordered = T)))
+
+p3 = g3 + 
+  coord_cartesian(xlim = c(1, my_xlim)
+                  #, ylim = c(0, 6)
+                  , ylim = c(0, max(snp_count))
+                  , clip = "off") +
+  geom_bar(aes(fill = group), colour = "grey") +
+  scale_fill_manual(values = colours
+                    , guide = "none") +
+  geom_tile(aes(,-0.8, width = 0.95, height = 0.85)
+            , fill = df$lab_bg) +
+  geom_tile(aes(,-1.2, width = 0.95, height = -0.2)
+            , fill = df$lab_bg2) +
+  
+  # Here it"s important to specify that your axis goes from 1 to max number of levels
+  theme(axis.text.x = element_text(size = my_xats
+                                   , angle = 90
+                                   , hjust = 1
+                                   , vjust = 0.4
+                                   , colour = my_axis_colours)
+        , axis.text.y = element_text(size = my_yats 
+                                     , angle = 0
+                                     , hjust = 1
+                                     , vjust = 0)
+        , axis.title.x = element_text(size = my_xals)
+        , axis.title.y = element_text(size = my_yals+2 ) 
+        , axis.ticks.x = element_blank()) +
+  labs(title = ""
+       #title = my_title
+       , x = "Position"
+       , y = "Frequency")
+
+p3
+#=================
+# generate plot: AF by position unordered, not coloured
+#=================  
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+g1 = ggplot(df)
+p1 = g1 + 
+  geom_bar(aes(x = factor(position, ordered = T)
+               , y = af*100
+               #, fill = duet_outcome
+               )
+           , color = "black"
+           , fill = "lightgrey"
+           , stat = "identity") +
+  theme(axis.text.x = element_blank()
+         , axis.text.y = element_text(size = my_ats
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_blank()
+         , axis.title.y = element_text(size = my_als) ) +
+  labs(title = ""
+       #, size = 100
+       #, x = "Wild-type position"
+       , y = "AF(%)")
+       
+p1      
+#=================
+# generate plot: OR by position unordered
+#=================  
+my_ats = 20 # axis text size
+my_als = 22 # axis label size
+
+
+g2 = ggplot(df)
+
+p2 = g2 + 
+  geom_bar(aes(x = factor(position, ordered = T)
+               , y = or_mychisq
+               #, fill = duet_outcome
+               ) 
+           , colour = "black"
+           , fill = "lightgray"
+           , stat = "identity") +
+  theme(axis.text.x = element_blank()
+         , axis.text.y = element_text(size = my_ats
+                                      , angle = 0
+                                      , hjust = 1
+                                      , vjust = 0)
+         , axis.title.x = element_blank()
+         , axis.title.y = element_text(size = my_als) ) +
+  labs(#title = "OR by position"
+       #, x = "Wild-type position"
+       y = "OR") 
+
+p2
+
+########################################################################
+#               			end of DUET barplots         			               #
+########################################################################
+#=======
+# output
+#=======
+ps_plots_combined = "af_or_combined_PS.svg"
+plot_ps_plots_combined  =  paste0(plotdir,"/", ps_plots_combined )
+
+svg(plot_ps_plots_combined , width = 26, height = 12)
+
+#theme_set(theme_gray()) # to preserve default theme
+printFile = cowplot::plot_grid(p1, p2, p3
+                            , ncol = 1
+                            , align = 'v')
+printFile
+dev.off()

From 2c2c2c1a6004106c436e6dba3b2ce9d6cf1854cf Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 19:03:52 +0100
Subject: [PATCH 24/27] regenerated combined_or figure with correct muts

---
 scripts/plotting/combined_or.R | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/plotting/combined_or.R b/scripts/plotting/combined_or.R
index 1c167a8..0f76e0e 100644
--- a/scripts/plotting/combined_or.R
+++ b/scripts/plotting/combined_or.R
@@ -108,8 +108,8 @@ g1 = ggplot(ps_df, aes(x = factor(position)
 
 p1 = g1 + 
   geom_point(aes(col = duet_outcome
-                 #, size = or_mychisq))+
-                 , size = or_kin)) +
+                 , size = or_mychisq))+
+                 #, size = or_kin)) + # not good, almost like log(or)
   theme(axis.text.x = element_text(size = my_ats
                                    , angle = 90
                                    , hjust = 1
@@ -148,8 +148,8 @@ g2 = ggplot(lig_df, aes(x = factor(position)
 
 p2 = g2 + 
   geom_point(aes(col = ligand_outcome
-                 #, size = or_mychisq))+
-                 , size = or_kin)) +
+                 , size = or_mychisq))+
+                 #, size = or_kin)) + # not good! almost like log(or)
   theme(axis.text.x = element_text(size = my_ats
                                    , angle = 90
                                    , hjust = 1
@@ -178,7 +178,7 @@ p2
 #======================
 
 svg(plot_or_combined, width = 32, height = 12)
-svg(plot_or_kin_combined, width = 32, height = 12)
+#svg(plot_or_kin_combined, width = 32, height = 12)
 
 theme_set(theme_gray()) # to preserve default theme
 

From f3f86d6651c50ea8ab07075b7b009be033b19717 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 19:11:06 +0100
Subject: [PATCH 25/27] renamed file

---
 scripts/plotting/{combined_or.R => or_plots_combined.R} | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename scripts/plotting/{combined_or.R => or_plots_combined.R} (98%)

diff --git a/scripts/plotting/combined_or.R b/scripts/plotting/or_plots_combined.R
similarity index 98%
rename from scripts/plotting/combined_or.R
rename to scripts/plotting/or_plots_combined.R
index 0f76e0e..badfda5 100644
--- a/scripts/plotting/combined_or.R
+++ b/scripts/plotting/or_plots_combined.R
@@ -54,8 +54,8 @@ cat(paste0("Variables imported:"
 or_combined = "or_combined_PS_LIG.svg"
 plot_or_combined  =  paste0(plotdir,"/", or_combined)
 
-or_kin_combined = "or_kin_combined_PS_LIG.svg"
-plot_or_kin_combined  =  paste0(plotdir,"/", or_kin_combined)
+#or_kin_combined = "or_kin_combined_PS_LIG.svg"
+#plot_or_kin_combined  =  paste0(plotdir,"/", or_kin_combined)
 #=======================================================================
 
 ###########################

From 9bee97052e1cfe15e43c8b7ec290a0174aa107df Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 20:48:21 +0100
Subject: [PATCH 26/27] added correlation plots

---
 scripts/plotting/corr_plots.R               | 260 ++++++++++++++++++++
 scripts/plotting/lineage_dist_combined_PS.R |  17 +-
 scripts/plotting/or_plots_combined.R        |   4 +-
 3 files changed, 272 insertions(+), 9 deletions(-)
 create mode 100644 scripts/plotting/corr_plots.R

diff --git a/scripts/plotting/corr_plots.R b/scripts/plotting/corr_plots.R
new file mode 100644
index 0000000..c2d3e17
--- /dev/null
+++ b/scripts/plotting/corr_plots.R
@@ -0,0 +1,260 @@
+#!/usr/bin/env Rscript
+#########################################################
+# TASK: Corr plots for PS and Lig 
+
+# Output: 1 svg
+
+#=======================================================================
+# working dir and loading libraries
+getwd()
+setwd("~/git/LSHTM_analysis/scripts/plotting/")
+getwd()
+
+
+source("Header_TT.R")
+require(cowplot)
+source("combining_dfs_plotting.R")
+# should return the following dfs, directories and variables
+
+# PS combined: 
+# 1) merged_df2
+# 2) merged_df2_comp
+# 3) merged_df3
+# 4) merged_df3_comp
+
+# LIG combined: 
+# 5) merged_df2_lig
+# 6) merged_df2_comp_lig
+# 7) merged_df3_lig
+# 8) merged_df3_comp_lig
+
+# 9) my_df_u
+# 10) my_df_u_lig
+
+cat(paste0("Directories imported:"
+           , "\ndatadir:", datadir
+           , "\nindir:", indir
+           , "\noutdir:", outdir
+           , "\nplotdir:", plotdir))
+
+cat(paste0("Variables imported:"
+           , "\ndrug:", drug
+           , "\ngene:", gene
+           , "\ngene_match:", gene_match
+           , "\nAngstrom symbol:", angstroms_symbol
+           , "\nNo. of duplicated muts:", dup_muts_nu
+           , "\nNA count for ORs:", na_count
+           , "\nNA count in df2:", na_count_df2
+           , "\nNA count in df3:", na_count_df3))     
+
+#=======
+# output
+#=======
+# can't combine by cowplot because not ggplots
+#corr_plot_combined = "corr_combined.svg"
+#plot_corr_plot_combined  =  paste0(plotdir,"/", corr_plot_combined)
+
+# PS
+corr_ps = "corr_PS.svg"
+plot_corr_ps =  paste0(plotdir,"/", corr_ps)
+
+# LIG
+corr_lig = "corr_LIG.svg"
+plot_corr_lig =  paste0(plotdir,"/", corr_lig)
+
+####################################################################
+#               end of loading libraries and functions                 #
+########################################################################
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+df_ps = merged_df3_comp 
+df_lig = merged_df3_comp_lig
+#%%%%%%%%%%%%%%%%%%%%%%%%%
+
+rm( merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, my_df_u_lig)
+
+########################################################################
+#               end of data extraction and cleaning for plots          #
+########################################################################
+
+#===========================
+# Data for Correlation plots:PS
+#===========================
+# adding log cols 
+df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
+df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
+
+df_ps$log10_or_kin = log10(df_ps$or_kin)
+df_ps$neglog_pwald_kin = -log10(df_ps$pwald_kin)
+
+# subset data to generate pairwise correlations
+cols_to_select =  c("duet_scaled"
+                    
+                    , "log10_or_mychisq"
+                    , "neglog_pval_fisher"
+                    
+                    , "log10_or_kin"
+                    , "neglog_pwald_kin"
+                    
+                    , "af"
+                    
+                    , "duet_outcome"
+                    , drug)
+
+corr_data_ps = df_ps[, cols_to_select]
+
+dim(corr_data_ps)
+
+#p_italic = substitute(paste("-Log(", italic('P'), ")"));p_italic 
+#p_adjusted_italic = substitute(paste("-Log(", italic('P adjusted'), ")"));p_adjusted_italic
+
+# assign nice colnames (for display)
+my_corr_colnames = c("DUET"
+                     
+                     , "Log(OR)"
+                     , "-Log(P)"
+                     
+                     , "Log(OR adjusted)"
+                     , "-Log(P wald)"
+                     
+                     , "AF"
+                     
+                     , "duet_outcome"
+                     , drug)
+
+length(my_corr_colnames)
+
+colnames(corr_data_ps)
+colnames(corr_data_ps) <- my_corr_colnames
+colnames(corr_data_ps)
+
+#-----------------
+# generate corr PS plot
+#-----------------
+start = 1
+end = which(colnames(corr_data_ps) == drug); end # should be the last column
+offset = 1
+
+my_corr_ps = corr_data_ps[start:(end-offset)]
+head(my_corr_ps)
+
+#my_cols = c("#f8766d", "#00bfc4")
+# deep blue :#007d85
+# deep red: #ae301e
+
+cat("Corr plot PS:", plot_corr_ps)
+svg(plot_corr_ps, width = 15, height = 15)
+
+OutPlot1 = pairs.panels(my_corr_ps[1:(length(my_corr_ps)-1)]
+             , method = "spearman" # correlation method
+             , hist.col = "grey" ##00AFBB
+             , density = TRUE  # show density plots
+             , ellipses = F # show correlation ellipses
+             , stars = T
+             , rug = F
+             , breaks = "Sturges"
+             , show.points = T
+             , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_ps$duet_outcome))]
+             , pch = 21
+             , jitter = T
+             #, alpha = .05
+             #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
+             , cex = 3
+             , cex.axis = 2.5
+             , cex.labels = 2.1
+             , cex.cor = 1
+             , smooth = F
+)
+
+print(OutPlot1)
+dev.off()
+
+#===========================
+# Data for Correlation plots: LIG
+#===========================
+
+df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
+df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)
+
+
+df_lig$log10_or_kin = log10(df_lig$or_kin)
+df_lig$neglog_pwald_kin = -log10(df_lig$pwald_kin)
+
+
+# subset data to generate pairwise correlations
+cols_to_select =  c("affinity_scaled"
+                    
+                    , "log10_or_mychisq"
+                    , "neglog_pval_fisher"
+                    
+                    , "log10_or_kin"
+                    , "neglog_pwald_kin"
+                    
+                    , "af"
+                    
+                    , "ligand_outcome"
+                    , drug)
+
+corr_data_lig = df_lig[, cols_to_select]
+
+
+dim(corr_data_lig)
+
+# assign nice colnames (for display)
+my_corr_colnames = c("Ligand Affinity"
+                     
+                     , "Log(OR)"
+                     , "-Log(P)"
+                     
+                     , "Log(OR adjusted)"
+                     , "-Log(P wald)"
+                     
+                     , "AF"
+                     
+                     , "ligand_outcome"
+                     , drug)
+
+length(my_corr_colnames)
+
+colnames(corr_data_lig)
+colnames(corr_data_lig) <- my_corr_colnames
+colnames(corr_data_lig)
+
+#-----------------
+# generate corr LIG plot
+#-----------------
+
+start = 1
+end = which(colnames(corr_data_lig) == drug); end # should be the last column
+offset = 1
+
+my_corr_lig = corr_data_lig[start:(end-offset)]
+head(my_corr_lig)
+
+cat("Corr LIG plot:", plot_corr_lig)
+svg(plot_corr_lig, width = 15, height = 15)
+
+OutPlot2  = pairs.panels(my_corr_lig[1:(length(my_corr_lig)-1)]
+                  , method = "spearman" # correlation method
+                  , hist.col = "grey" ##00AFBB
+                  , density = TRUE  # show density plots
+                  , ellipses = F # show correlation ellipses
+                  , stars = T
+                  , rug = F
+                  , breaks = "Sturges"
+                  , show.points = T
+                  , bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr_lig$ligand_outcome))]
+                  , pch = 21
+                  , jitter = T
+                  #, alpha = .05
+                  #, points(pch = 19, col = c("#f8766d", "#00bfc4"))
+                  , cex = 3
+                  , cex.axis = 2.5
+                  , cex.labels = 2.1
+                  , cex.cor = 1
+                  , smooth = F
+)
+
+print(OutPlot2)
+dev.off()
+#######################################################
diff --git a/scripts/plotting/lineage_dist_combined_PS.R b/scripts/plotting/lineage_dist_combined_PS.R
index d62b038..f59cf44 100644
--- a/scripts/plotting/lineage_dist_combined_PS.R
+++ b/scripts/plotting/lineage_dist_combined_PS.R
@@ -53,6 +53,12 @@ cat("Variables imported:"
     , "\nother_muts_col:", other_muts_col
     , "\ndrtype_col:", resistance_col)
 
+#=======
+# output
+#=======
+lineage_dist_combined = "lineage_dist_combined_PS.svg"
+plot_lineage_dist_combined  =  paste0(plotdir,"/", lineage_dist_combined)
+#========================================================================
 
 ###########################
 # Data for plots
@@ -281,13 +287,10 @@ p2 = ggplot(df_dr, aes(x = duet_scaled
 
 print(p2)
 #dev.off()
-#====================
-#combine
-#=======
-# output
-#=======
-lineage_dist_combined = "lineage_dist_combined_PS.svg"
-plot_lineage_dist_combined  =  paste0(plotdir,"/", lineage_dist_combined)
+########################################################################
+#==============
+# combine plot
+#===============
 
 svg(plot_lineage_dist_combined, width = 12, height = 6)
 
diff --git a/scripts/plotting/or_plots_combined.R b/scripts/plotting/or_plots_combined.R
index badfda5..a664f1b 100644
--- a/scripts/plotting/or_plots_combined.R
+++ b/scripts/plotting/or_plots_combined.R
@@ -1,8 +1,8 @@
 #!/usr/bin/env Rscript
 #########################################################
-# TASK: Basic lineage barplot showing numbers
+# TASK: Bubble plot of OR for PS and Lig 
 
-# Output: Basic barplot with lineage samples and mut count
+# Output: 1 svg
 
 #=======================================================================
 # working dir and loading libraries

From fc47c58f91733fdf99365066a45f10ac62788e53 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Wed, 9 Sep 2020 20:56:07 +0100
Subject: [PATCH 27/27] saving work after correlation plots

---
 scripts/plotting/corr_plots.R | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scripts/plotting/corr_plots.R b/scripts/plotting/corr_plots.R
index c2d3e17..00c2c5a 100644
--- a/scripts/plotting/corr_plots.R
+++ b/scripts/plotting/corr_plots.R
@@ -80,6 +80,8 @@ rm( merged_df2, merged_df2_comp, merged_df2_lig, merged_df2_comp_lig, my_df_u, m
 #===========================
 # Data for Correlation plots:PS
 #===========================
+table(df_ps$duet_outcome)
+
 # adding log cols 
 df_ps$log10_or_mychisq = log10(df_ps$or_mychisq)
 df_ps$neglog_pval_fisher = -log10(df_ps$pval_fisher)
@@ -172,6 +174,7 @@ dev.off()
 #===========================
 # Data for Correlation plots: LIG
 #===========================
+table(df_lig$ligand_outcome)
 
 df_lig$log10_or_mychisq = log10(df_lig$or_mychisq)
 df_lig$neglog_pval_fisher = -log10(df_lig$pval_fisher)