From 2c013124ad9b59e1d4a43c5b9facfe5d5c2fcbee Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Mon, 21 Sep 2020 17:54:54 +0100
Subject: [PATCH] updated gitignore to tidyup

---
 .gitignore                                    |   1 +
 meta_data_analysis/AF_and_OR_calcs.R          | 398 ---------------
 meta_data_analysis/combine_dfs.py             | 248 ----------
 meta_data_analysis/combining_df_lig.R         | 397 ---------------
 meta_data_analysis/combining_df_ps.R          | 461 ------------------
 meta_data_analysis/dssp_df.py                 | 170 -------
 meta_data_analysis/init_data_dirs.py          |  36 --
 meta_data_analysis/kd_df.py                   | 193 --------
 .../mut_electrostatic_changes.py              | 167 -------
 meta_data_analysis/mutate.py                  | 179 -------
 meta_data_analysis/rd_df.py                   | 135 -----
 meta_data_analysis/run_mutate.sh              |  18 -
 meta_data_analysis/run_pdb_dssp.sh            |  54 --
 plotting_test/logo_plot.R                     | 221 ---------
 14 files changed, 1 insertion(+), 2677 deletions(-)
 delete mode 100644 meta_data_analysis/AF_and_OR_calcs.R
 delete mode 100755 meta_data_analysis/combine_dfs.py
 delete mode 100644 meta_data_analysis/combining_df_lig.R
 delete mode 100644 meta_data_analysis/combining_df_ps.R
 delete mode 100755 meta_data_analysis/dssp_df.py
 delete mode 100755 meta_data_analysis/init_data_dirs.py
 delete mode 100644 meta_data_analysis/kd_df.py
 delete mode 100755 meta_data_analysis/mut_electrostatic_changes.py
 delete mode 100644 meta_data_analysis/mutate.py
 delete mode 100755 meta_data_analysis/rd_df.py
 delete mode 100755 meta_data_analysis/run_mutate.sh
 delete mode 100755 meta_data_analysis/run_pdb_dssp.sh
 delete mode 100644 plotting_test/logo_plot.R

diff --git a/.gitignore b/.gitignore
index 2f8afad..c1ac04d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@
 __pycache__
 */__pycache__
 mcsm_analysis_fixme
+meta_data_analysis
 del
 examples
 example
diff --git a/meta_data_analysis/AF_and_OR_calcs.R b/meta_data_analysis/AF_and_OR_calcs.R
deleted file mode 100644
index f5c9db6..0000000
--- a/meta_data_analysis/AF_and_OR_calcs.R
+++ /dev/null
@@ -1,398 +0,0 @@
-#########################################################
-# TASK: To calculate Allele Frequency and
-# Odds Ratio from master data
-# and add the calculated params to meta_data extracted from
-# data_extraction.py
-#########################################################
-getwd()
-setwd('~/git/LSHTM_analysis/meta_data_analysis')
-getwd()
-  
-#%% variable assignment: input and output paths & filenames
-drug = 'pyrazinamide'
-gene = 'pncA'
-gene_match = paste0(gene,'_p.')
-cat(gene_match)
-
-#===========
-# input
-#===========
-# infile1: Raw data
-#indir = 'git/Data/pyrazinamide/input/original'
-indir = paste0('~/git/Data')
-in_filename  = 'original_tanushree_data_v2.csv'
-infile = paste0(indir, '/', in_filename)
-cat(paste0('Reading infile1: raw data', ' ', infile) )
-
-# infile2: gene associated meta data file to extract valid snps and add calcs to.
-# This is outfile3 from data_extraction.py
-indir_metadata = paste0('~/git/Data', '/', drug, '/', 'output')
-in_filename_metadata = 'pnca_metadata.csv'
-infile_metadata = paste0(indir_metadata, '/', in_filename_metadata)
-cat(paste0('Reading infile2: gene associated metadata:', infile_metadata))
-
-#===========
-# output
-#===========
-# outdir = 'git/Data/pyrazinamide/output'
-outdir = paste0('~/git/Data', '/', drug, '/', 'output')
-out_filename = paste0(tolower(gene),'_', 'meta_data_with_AFandOR.csv')
-outfile = paste0(outdir, '/', out_filename)
-cat(paste0('Output file with full path:', outfile))
-#%% end of variable assignment for input and output files
-
-#########################################################
-# 1: Read master/raw data stored in Data/
-#########################################################
-raw_data_all = read.csv(infile, stringsAsFactors = F)
-
-raw_data = raw_data_all[,c("id"
-                     , "pyrazinamide"
-                     , "dr_mutations_pyrazinamide"
-                     , "other_mutations_pyrazinamide")]
-rm(raw_data_all)
-
-rm(indir, in_filename, infile)
-
-#===========
-# 1a: exclude na
-#===========
-raw_data = raw_data[!is.na(raw_data$pyrazinamide),]
-
-total_samples = length(unique(raw_data$id))
-cat(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples))
-
-# sanity check: should  be true
-is.numeric(total_samples) 
-
-#===========
-# 1b: combine the two mutation columns
-#===========
-raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide
-                                            , raw_data$other_mutations_pyrazinamide)
-head(raw_data$all_mutations_pyrazinamide)
-
-#===========
-# 1c: create yet another column that contains all the mutations but in lower case
-#===========
-raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide) 
-
-# sanity checks
-#table(grepl("pnca_p",raw_data$all_muts_pnca))
-cat(paste0('converting gene match:', gene_match, ' ', 'to lowercase'))
-gene_match = tolower(gene_match)
-
-table(grepl(gene_match,raw_data$all_muts_pnca))
-
-# sanity check: should be TRUE
-#sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
-sum(table(grepl(gene_match,raw_data$all_muts_pnca))) == total_samples
-
-# set up variables: can be used for logistic regression as well
-i  = "pnca_p.ala134gly" # has a NA, should NOT exist
-table(grepl(i,raw_data$all_muts_pnca))
-
-i = "pnca_p.trp68gly"
-table(grepl(i,raw_data$all_muts_pnca))
-
-mut = grepl(i,raw_data$all_muts_pnca)
-dst = raw_data$pyrazinamide
-table(mut, dst)
-
-#chisq.test(table(mut,dst))
-#fisher.test(table(mut, dst))
-#table(mut)
-
-#########################################################
-# 2: Read valid snps for which OR 
-# can be calculated (infile_comp_snps.csv)
-#########################################################
-cat(paste0('Reading metadata infile:', infile_metadata))
-
-pnca_metadata = read.csv(infile_metadata
-#                        , file.choose()
-                        , stringsAsFactors = F
-                        , header = T)
-
-
-# clear variables
-rm(indir, in_filename, infile)
-rm(indir_metadata, in_filename_metadata, infile_metadata)
-
-# count na in pyrazinamide column
-tot_pza_na = sum(is.na(pnca_metadata$pyrazinamide))
-expected_rows = nrow(pnca_metadata) - tot_pza_na
-
-# drop na from the pyrazinamide colum
-pnca_snps_or = pnca_metadata[!is.na(pnca_metadata$pyrazinamide),]
-
-# sanity check
-if(nrow(pnca_snps_or) == expected_rows){
-  cat('PASS: no. of rows match with expected_rows')
-} else{
-  cat('FAIL: nrows mismatch.')
-}
-
-# extract unique snps to iterate over for AF and OR calcs
-pnca_snps_unique = unique(pnca_snps_or$mutation) 
-
-cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(pnca_snps_unique)))
-
-# Define OR function
-x = as.numeric(mut)
-y = dst
-or = function(x,y){
-  tab = as.matrix(table(x,y))
-  a = tab[2,2]
-  if (a==0){ a<-0.5}
-  b = tab[2,1]
-  if (b==0){ b<-0.5}
-  c = tab[1,2]
-  if (c==0){ c<-0.5}
-  d = tab[1,1]
-  if (d==0){ d<-0.5}
-  (a/b)/(c/d)
-  
-  }
-
-dst = raw_data$pyrazinamide
-ors = sapply(pnca_snps_unique,function(m){
-  mut = grepl(m,raw_data$all_muts_pnca)
-  or(mut,dst)
-})
-
-ors
-
-pvals = sapply(pnca_snps_unique,function(m){
-  mut = grepl(m,raw_data$all_muts_pnca)
-  fisher.test(mut,dst)$p.value
-})
-
-pvals
-
-afs = sapply(pnca_snps_unique,function(m){
-  mut = grepl(m,raw_data$all_muts_pnca)
-  mean(mut)
-})
-
-afs
-
-# check ..hmmm
-afs['pnca_p.trp68gly']
-afs['pnca_p.gln10pro'] 
-afs['pnca_p.leu4ser'] 
-
-plot(density(log(ors)))
-plot(-log10(pvals))
-hist(log(ors)
-     , breaks = 100
-     )
-
-# sanity check
-if (table(names(ors) ==  names(pvals)) & table(names(ors) == names(afs)) & table(names(pvals) == names(afs)) == length(pnca_snps_unique)){
-cat('PASS: names of ors, pvals and afs match: proceed with combining into a single df')
-} else{
-  cat('FAIL: names of ors, pvals and afs mismatch')
-}
-
-# combine ors, pvals and afs
-cat('Combining calculated params into a df: ors, pvals and afs')
-
-comb_AF_and_OR = data.frame(ors, pvals, afs)
-cat('No. of rows in comb_AF_and_OR: ', nrow(comb_AF_and_OR)
-    , '\nNo. of cols in comb_AF_and_OR: ', ncol(comb_AF_and_OR))
-
-cat('Rownames == mutation: ', head(rownames(comb_AF_and_OR)))
-
-# add rownames of comb_AF_and_OR as an extra column 'mutation' to allow merging based on this column
-comb_AF_and_OR$mutation = rownames(comb_AF_and_OR)
-
-# sanity check
-if (table(rownames(comb_AF_and_OR) == comb_AF_and_OR$mutation)){
-  cat('PASS: rownames and mutaion col values match')
-}else{
-  cat('FAIL: rownames and mutation col values mismatch')
-}
-
-#########################################################
-# 3: Merge meta data file + calculated num params
-#########################################################
-df1 = pnca_metadata
-df2 = comb_AF_and_OR
-
-cat('checking commom col of the two dfs before merging:'
-    ,'\ndf1:', head(df1$mutation)
-    , '\ndf2:', head(df2$mutation))
-
-cat(paste0('merging two dfs: '
-      ,'\ndf1 (big df i.e. meta data) nrows: ', nrow(df1)
-      ,'\ndf2 (small df i.e af, or, pval) nrows: ', nrow(df2)
-      ,'\nexpected rows in merged df: ', nrow(df1)
-      ,'\nexpected cols in merged_df: ', (ncol(df1) + ncol(df2) - 1)))
-
-merged_df = merge(df1 # big file
-                  , df2 # small (afor file)
-                  , by = "mutation"
-                  , all.x = T) # because you want all the entries of the meta data 
-
-# sanity check
-if(ncol(merged_df) == (ncol(df1) + ncol(df2) - 1)){
-  cat(paste0('PASS: no. of cols is as expected: ', ncol(merged_df)))
-} else{
-  cat('FAIL: no.of cols mistmatch')
-}
-
-# quick check
-i  = "pnca_p.ala134gly" # has all NAs in pyrazinamide, should be NA in ors, etc.
-merged_df[merged_df$mutation == i,]
-
-# count na in each column
-na_count = sapply(merged_df, function(y) sum(length(which(is.na(y))))); na_count
-
-# check last three cols: should be NA
-if ( identical(na_count[[length(na_count)]], na_count[[length(na_count)-1]], na_count[[length(na_count)-2]])){
-  cat('PASS: No. of NAs for OR, AF and Pvals are equal as expected',
-      '\nNo. of NA: ', na_count[[length(na_count)]])
-} else {
-  cat('FAIL: No. of NAs for OR, AF and Pvals mismatch')
-}
-
-# reassign custom colnames
-cat('Assigning custom colnames for the calculated params...')
-colnames(merged_df)[colnames(merged_df)== "ors"] <- "OR"
-colnames(merged_df)[colnames(merged_df)== "pvals"] <- "pvalue"
-colnames(merged_df)[colnames(merged_df)== "afs"] <- "AF"
-
-colnames(merged_df)
-
-# add 3 more cols: log OR, neglog pvalue and AF_percent cols
-merged_df$logor = log(merged_df$OR)
-is.numeric(merged_df$logor)
-
-merged_df$neglog10pvalue = -log10(merged_df$pvalue) 
-is.numeric(merged_df$neglog10pvalue)
-
-merged_df$AF_percent = merged_df$AF*100
-is.numeric(merged_df$AF_percent)
-
-# check AFs
-#i = 'pnca_p.trp68gly'
-i = 'pnca_p.gln10pro' 
-#i = 'pnca_p.leu4ser'
-merged_df[merged_df$mutation == i,]
-
-# FIXME: harcoding (beware!), NOT FATAL though!
-ncol_added = 3 
-
-cat(paste0('Added', ' ', ncol_added, ' more cols to merged_df:'
-          , '\ncols added: logor, neglog10pvalue and AF_percent:'
-          , '\nno. of cols in merged_df now: ', ncol(merged_df)))
-
-#%% write file out: pnca_meta_data_with_AFandOR
-#*********************************************
-cat(paste0('writing output file: '
-             , '\nFilename: ', out_filename
-             , '\nPath:', outdir))
-
-write.csv(merged_df, outfile
-          , row.names = F)
-
-cat(paste0('Finished writing:'
-           , out_filename
-           , '\nNo. of rows: ', nrow(merged_df)
-           , '\nNo. of cols: ', ncol(merged_df)))
-#************************************************
-cat('======================================================================')
-rm(out_filename)
-cat('End of script: calculated AF, OR, pvalues and saved file')
-# End of script
-#%%
-# sanity check: Count NA in these four cols.
-# However these need to be numeric else these
-# will be misleading when counting NAs (i.e retrun 0)
-#is.numeric(meta_with_afor$OR)
-na_var = c('AF', 'OR', 'pvalue', 'logor', 'neglog10pvalue')
-
-# loop through these vars and check if these are numeric.
-# if not, then convert to numeric
-check_all = NULL
-
-for (i in na_var){
-  #    cat(i)
-  check0 = is.numeric(meta_with_afor[,i])
-  if (check0) {
-    check_all = c(check0, check_all)
-    cat('These are all numeric cols')  
-  } else{
-    cat('First converting to numeric')
-    check0 = as.numeric(meta_with_afor[,i])
-    check_all = c(check0, check_all)
-  }
-}
-
-# count na now that the respective cols are numeric
-na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
-str(na_count)
-
-# extract how many NAs: 
-# should be all TRUE 
-# should be a single number since 
-# all the cols should have 'equal' and 'same' no. of NAs
-# compare if the No of 'NA' are the same for all these cols
-na_len = NULL
-for (i in na_var){
-  temp = na_count[[i]]
-  na_len = c(na_len, temp)
-}
-
-cat('Checking how many NAs and if these are identical for the selected cols:')
-my_nrows = NULL
-for ( i in 1: (length(na_len)-1) ){
-# cat(compare(na_len[i]), na_len[i+1])
-  c = compare(na_len[i], na_len[i+1])
-  if ( c$result ) {
-    cat('PASS: No. of NAa in selected cols are identical')
-    my_nrows = na_len[i] }
-  else { 
-  cat('FAIL: No. of NAa in selected cols mismatch') 
-  }
-}
-
-cat('No. of NAs in each of the selected cols: ', my_nrows)
-
-# yet more sanity checks:
-cat('Check whether the ', my_nrows, 'indices are indeed the same')
-
-#which(is.na(meta_with_afor$OR)) 
-
-# initialise an empty df with nrows as extracted above
-na_count_df = data.frame(matrix(vector(mode = 'numeric'
-#                                       , length = length(na_var)
-                                       )
-                                , nrow = my_nrows
-#                                , ncol = length(na_var)
-                              ))
-
-# populate the df with the indices of the cols that are NA
-for (i in na_var){
-  cat(i)
-  na_i = which(is.na(meta_with_afor[i]))
-  na_count_df = cbind(na_count_df, na_i)
-  colnames(na_count_df)[which(na_var == i)] <- i
-}
-
-# Now compare these indices to ensure these are the same
-check2 = NULL
-for ( i in 1: ( length(na_count_df)-1 ) ) {
-#  cat(na_count_df[i] == na_count_df[i+1])
-  check_all = identical(na_count_df[[i]], na_count_df[[i+1]])
-  check2 = c(check_all, check2)
-  if ( all(check2) ) {
-    cat('PASS: The indices for AF, OR, etc are all the same\n')
-  } else {
-    cat ('FAIL: Please check indices which are NA')
-  }
-}
-
-
-
diff --git a/meta_data_analysis/combine_dfs.py b/meta_data_analysis/combine_dfs.py
deleted file mode 100755
index f20215d..0000000
--- a/meta_data_analysis/combine_dfs.py
+++ /dev/null
@@ -1,248 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-'''
-Created on Tue Aug  6 12:56:03 2019
-
-@author: tanu
-'''
-# FIXME: change filename 4 (mcsm normalised data)
-# to be consistent like (pnca_complex_mcsm_norm.csv) : changed manually, but ensure this is done in the mcsm pipeline
-#=============================================================================
-# Task: combine 4 dfs with aa position as linking column
-# This is done in 2 steps:
-# merge 1: of 3 dfs (filenames in lowercase)
-#   <gene.lower()>_dssp.csv
-#   <gene.lower()>_kd.csv
-#   <gene.lower()>_pnca_rd.csv
-
-# merge 2: of 2 dfs
-#    pnca_complex_mcsm_norm.csv (!fix name in mcsm script)
-#    output df from merge1
-
-# Input: 3 dfs
-#   <gene.lower()>_dssp.csv
-#   <gene.lower()>_kd.csv
-#   <gene.lower()>_pnca_rd.csv
-# pnca_complex_mcsm_norm.csv (!fix name in mcsm script)
-
-# Output: .csv of all 4 dfs combined
-
-# useful link
-# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
-#=============================================================================
-#%% load packages
-import sys, os
-import pandas as pd
-#import numpy as np
-import argparse
-#=======================================================================
-#%% specify input and curr dir
-homedir = os.path.expanduser('~')
-
-# set working dir
-os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
-os.getcwd()
-#=======================================================================
-#%% command line args
-arg_parser = argparse.ArgumentParser()
-#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
-#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
-arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazin')
-arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pn') # case sensitive
-args = arg_parser.parse_args()
-#=======================================================================
-#%% variable assignment: input and output 
-#drug = 'pyrazinamide'
-#gene = 'pncA'
-#gene_match = gene + '_p.'
-
-drug = args.drug
-gene = args.gene
-#==========
-# data dir
-#==========
-datadir = homedir + '/' + 'git/Data'
-
-#=======
-# input
-#=======
-indir = datadir + '/' + drug + '/' + 'output'
-in_filename1 = 'pnca_dssp.csv'
-in_filename2 = 'pnca_kd.csv'
-in_filename3 = 'pnca_rd.csv'
-#in_filename4 = 'mcsm_complex1_normalised.csv' # Fix name in mcsm script
-in_filename4 = 'pnca_complex_mcsm_norm.csv'  # manually changed temporarily
-
-infile1 = indir + '/' + in_filename1
-infile2 = indir + '/' + in_filename2
-infile3 = indir + '/' + in_filename3
-infile4 = indir + '/' + in_filename4
-
-print('\nInput path:', indir
-      , '\nInput filename1:', in_filename1
-      , '\nInput filename2:', in_filename2
-      , '\nInput filename3:', in_filename3
-      , '\nInput filename4:', in_filename4
-      , '\n===================================================================')
-
-#=======
-# output 
-#=======
-outdir =   datadir + '/' + drug + '/' + 'output'
-out_filename = gene.lower() + '_mcsm_struct_params.csv'
-outfile =  outdir + '/' + out_filename
-print('Output filename:', out_filename
-      , '\nOutput path:', outdir
-      , '\n===================================================================')
-
-#%% end of variable assignment for input and output files
-#=======================================================================
-#%%  Read input file
-dssp_df  = pd.read_csv(infile1, sep = ',')  
-kd_df  = pd.read_csv(infile2, sep = ',')  
-rd_df  = pd.read_csv(infile3, sep = ',')
-mcsm_df  = pd.read_csv(infile4, sep = ',')
-
-print('Reading input files:'
-      , '\ndssp file:', infile1
-      , '\nNo. of rows:', len(dssp_df)
-      , '\nNo. of cols:', len(dssp_df.columns)
-      , '\nColumn names:', dssp_df.columns
-      , '\n==================================================================='
-      , '\nkd file:', infile2
-      , '\nNo. of rows:', len(kd_df)
-      , '\nNo. of cols:', len(kd_df.columns)
-      , '\nColumn names:', kd_df.columns
-      , '\n==================================================================='
-      , '\nrd file:', infile3
-      , '\nNo. of rows:', len(rd_df)
-      , '\nNo. of cols:', len(rd_df.columns)
-      , '\nColumn names:', rd_df.columns
-      , '\n==================================================================='
-      , '\nrd file:', infile4
-      , '\nNo. of rows:', len(mcsm_df)
-      , '\nNo. of cols:', len(mcsm_df.columns)
-      , '\nColumn names:', mcsm_df.columns
-      , '\n===================================================================')
-#%% Begin combining dfs
-#===================
-# concatenating df1 (3dfs): dssp_df + kd_df+ rd_df
-#===================
-print('starting first merge...\n')
-
-# checking no. of rows
-print('Checking if no. of rows of the 3 dfs are equal:\n'
-      , len(dssp_df) == len(kd_df) == len(rd_df)
-      , '\nReason: fasta files and pdb files vary since not all pos are part of the structure'
-      , '\n===================================================================')
-
-# variables for sanity checks
-expected_rows_df1 = max(len(dssp_df), len(kd_df), len(rd_df))
-# beware of harcoding! used for sanity check
-ndfs = 3
-ncol_merge = 1
-offset = ndfs- ncol_merge
-expected_cols_df1 = len(dssp_df.columns) + len(kd_df.columns) + len(rd_df.columns) - offset
-
-print('Merge 1:'
-      , '\ncombining 3dfs by commom col: position'
-      , '\nExpected nrows in combined_df:', expected_rows_df1
-      , '\nExpected ncols in combined_df:', expected_cols_df1
-      , '\nResetting the common col as the index'
-      , '\n===================================================================')
-
-#dssp_df.set_index('position', inplace = True)
-#kd_df.set_index('position', inplace = True)
-#rd_df.set_index('position', inplace =True)
-
-#combined_df = pd.concat([dssp_df, kd_df, rd_df], axis = 1, sort = False).reset_index()
-#combined_df.rename(columns = {'index':'position'})
-
-combined_df1  = pd.concat(
-    (my_index.set_index('position') for my_index in [dssp_df, kd_df, rd_df])
-    , axis = 1, join = 'outer').reset_index()
-
-# sanity check
-print('Checking dimensions of concatenated df1...')
-if len(combined_df1) == expected_rows_df1 and len(combined_df1.columns) == expected_cols_df1:
-    print('PASS: combined df has expected dimensions'
-          , '\nNo. of rows in combined df:', len(combined_df1)
-          , '\nNo. of cols in combined df:', len(combined_df1.columns)
-          , '\n===============================================================')
-else:
-    print('FAIL: combined df does not have expected dimensions'
-          , '\nNo. of rows in combined df:', len(combined_df1)
-          , '\nNo. of cols in combined df:', len(combined_df1.columns)
-          , '\n===============================================================')
-
-#===================
-# concatenating df2 (2dfs): mcsm_df + combined_df1
-# sort sorts the cols
-#===================
-print('starting second merge...\n')
-
-# rename col 'Position' in mcsm_df to lowercase 'position'
-# as it matches the combined_df1 colname to perfom merge
-#mcsm_df.columns
-#mcsm_df.rename(columns = {'Position':'position'}) # not working!
-# copy 'Position' column with the correct colname 
-print('Firstly, copying \'Position\' col and renaming \'position\' to allow merging'
-      , '\nNo. of cols before copying: ', len(mcsm_df.columns))
-
-mcsm_df['position'] = mcsm_df['Position']
-print('No. of cols after copying: ', len(mcsm_df.columns))
-
-# sanity check
-if mcsm_df['position'].equals(mcsm_df['Position']):
-    print('PASS: Copying worked correctly'
-          , '\ncopied col matches original column'
-          , '\n===============================================================')
-else:
-    print('FAIL: copied col does not match original column'
-         , '\n================================================================')
-
-# variables for sanity checks
-expected_rows_df2 = len(mcsm_df)
-# beware of harcoding! used for sanity check
-ndfs = 2
-ncol_merge = 1
-offset = ndfs - ncol_merge
-expected_cols_df2 = len(mcsm_df.columns) + len(combined_df1.columns) - offset
-
-print('Merge 2:'
-      , '\ncombining 2dfs by commom col: position'
-      , '\nExpected nrows in combined_df:', expected_rows_df2
-      , '\nExpected ncols in combined_df:', expected_cols_df2
-      , '\n===================================================================')
-
-combined_df2 = mcsm_df.merge(combined_df1, on = 'position')
-
-# sanity check
-print('Checking dimensions of concatenated df2...')
-if len(combined_df2) == expected_rows_df2 and len(combined_df2.columns) == expected_cols_df2:
-    print('PASS: combined df2 has expected dimensions'
-          , '\nNo. of rows in combined df:', len(combined_df2)
-          , '\nNo. of cols in combined df:', len(combined_df2.columns)
-          , '\n===============================================================')
-else:
-    print('FAIL: combined df2 does not have expected dimensions'
-          , '\nNo. of rows in combined df:', len(combined_df2)
-          , '\nNo. of cols in combined df:', len(combined_df2.columns)
-          , '\n===============================================================')
-    
-#%% write file
-print('Writing file:'
-      , '\nFilename:', out_filename
-      , '\nPath:',  outdir
-      , '\n===================================================================')
-
-combined_df2.to_csv(outfile, header = True, index = False)
-
-print('Finished writing:', out_filename
-      , '\nNo. of rows:', len(combined_df2)
-      , '\nNo. of cols:', len(combined_df2.columns)
-      , '\n===================================================================')
-
-#%% end of script
-#==============================================================================
diff --git a/meta_data_analysis/combining_df_lig.R b/meta_data_analysis/combining_df_lig.R
deleted file mode 100644
index d9b6b9a..0000000
--- a/meta_data_analysis/combining_df_lig.R
+++ /dev/null
@@ -1,397 +0,0 @@
-#=======================================================================
-# TASK: To combine mcsm and meta data with af and or
-# by filtering for distance to ligand (<10Ang).
-# This script doesn't output anything.
-# This script is sourced from other .R scripts for plotting ligand plots
-
-# Input csv files:
-# 1) mcsm normalised and struct params
-# 2) gene associated meta_data_with_AFandOR
-#=======================================================================
-#%% specify curr dir
-getwd()
-setwd('~/git/LSHTM_analysis/meta_data_analysis/')
-getwd()
-#=======================================================================
-#%% load packages			 
-#require(data.table)
-#require(arsenal)
-#require(compare)
-#library(tidyverse)
-
-# header file
-header_dir = '~/git/LSHTM_analysis/'
-source(paste0(header_dir, '/', 'my_header.R'))
-#%% variable assignment: input and output paths & filenames
-#=======================================================================
-drug = 'pyrazinamide'
-gene = 'pncA'
-gene_match = paste0(gene,'_p.')
-cat(gene_match)
-
-#===========
-# data dir
-#===========
-datadir = '~/git/Data'
-
-#===========
-# input
-#===========
-# infile1: mCSM data
-#indir = '~/git/Data/pyrazinamide/input/processed/'
-indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline}
-#in_filename = 'mcsm_complex1_normalised.csv'
-in_filename = 'pnca_mcsm_struct_params.csv' 
-infile = paste0(indir, '/', in_filename)
-cat(paste0('Reading infile1: mCSM output file', ' ', infile) )
-
-# infile2: gene associated meta data combined with AF and OR
-#indir: same as above
-in_filename_comb = paste0(tolower(gene),  '_meta_data_with_AFandOR.csv')
-infile_comb = paste0(indir, '/', in_filename_comb)
-cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb))
-
-#===========
-# output
-#===========
-# Uncomment if and when required to output
-outdir = paste0(datadir, '/', drug, '/', 'output') #same as indir
-cat('Output dir: ', outdir, '\n')
-#out_filename = paste0(tolower(gene), 'XXX')
-#outfile = paste0(outdir, '/', out_filename)
-#cat(paste0('Output file with full path:', outfile))
-
-#%% end of variable assignment for input and output files
-#=======================================================================
-#%% Read input files
-
-#####################################
-# input file 1: mcsm normalised data
-# output of step 4 mcsm_pipeline
-#####################################
-
-cat('Reading mcsm_data:'
-    , '\nindir: ', indir
-    , '\ninfile_comb: ', in_filename)
-
-mcsm_data = read.csv(infile
-#                     , row.names = 1
-                     , stringsAsFactors = F
-                     , header = T)
-
-cat('Read mcsm_data file:'
-    , '\nNo.of rows: ', nrow(mcsm_data)
-    , '\nNo. of cols:', ncol(mcsm_data))
-
-# clear variables
-rm(in_filename, infile)
-
-str(mcsm_data)
-
-table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
-
-# spelling Correction 1: DUET
-mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
-mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
-
-# checks: should be the same as above
-table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
-head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
-
-# spelling Correction 2: Ligand
-table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
-
-mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
-mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
-
-# checks: should be the same as above
-table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
-head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
-
-# muts with opposing effects on protomer and ligand stability
-# excluded from here as it is redundant. 
-# check 'combining_two_df.R' to refer if required.
-#=======================================================================
-#%% !!!Filter data only for mcsm lig!!!
-
-########################### 
-# Filter/subset data 
-# Lig plots < 10Ang
-# Filter the lig plots for Dis_to_lig < 10Ang
-###########################
-
-# check range of distances
-max(mcsm_data$Dis_lig_Ang)
-min(mcsm_data$Dis_lig_Ang)
-
-# count
-table(mcsm_data$Dis_lig_Ang<10)
-
-# subset data to have only values less than 10 Ang
-mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
-
-# sanity checks
-max(mcsm_data2$Dis_lig_Ang)
-min(mcsm_data2$Dis_lig_Ang)
-
-# count no of unique positions
-length(unique(mcsm_data2$Position))
-
-# count no of unique mutations
-length(unique(mcsm_data2$Mutationinformation))
-
-# count Destabilisinga and stabilising
-table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang}
-
-#############################
-# Extra sanity check:
-# for mcsm_lig ONLY
-# Dis_lig_Ang should be <10
-#############################
-
-if (max(mcsm_data2$Dis_lig_Ang) < 10){
-  print ("Sanity check passed: lig data is <10Ang")
-}else{
-  print ("Error: data should be filtered to be within 10Ang")
-}
-
-#!!!!!!!!!!!!!!!!!!!!!
-# REASSIGNMENT: so as not to alter the script
-mcsm_data = mcsm_data2
-#!!!!!!!!!!!!!!!!!!!!!
-#=======================================================================
-# clear variables
-rm(mcsm_data2)
-
-# count na in each column
-na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
-
-# sort by Mutationinformation
-mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
-head(mcsm_data$Mutationinformation)
-
-orig_col = ncol(mcsm_data)
-# get freq count of positions and add to the df
-setDT(mcsm_data)[, mut_pos_occurrence := .N, by = .(position)] 
-
-cat('Added col: position frequency to see which posn has how many muts'
-    , '\nNo. of cols now', ncol(mcsm_data)
-    , '\nNo. of cols before: ', orig_col)
-
-mut_pos_occurrence = data.frame(mcsm_data$id, mcsm_data$Position, mcsm_data$mut_pos_occurrence)
-
-######################################
-# input file2 meta data with AFandOR
-######################################
-cat('Reading combined meta data and AFandOR file:'
-    , '\nindir: ', indir
-    , '\ninfile_comb: ', in_filename_comb)
-
-meta_with_afor <- read.csv(infile_comb
-                      , stringsAsFactors = F
-                      , header = T)
-
-cat('Read mcsm_data file:'
-    , '\nNo.of rows: ', nrow(meta_with_afor)
-    , '\nNo. of cols:', ncol(meta_with_afor))
-
-# clear variables
-rm(in_filename_comb, infile_comb)
-
-str(meta_with_afor)
-
-# sort by Mutationinformation
-head(meta_with_afor$Mutationinformation)
-meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
-head(meta_with_afor$Mutationinformation)
-
-orig_col2 = ncol(meta_with_afor)
-
-# get freq count of positions and add to the df
-setDT(meta_with_afor)[, sample_pos_occurrence := .N, by = .(Position)] 
-
-cat('Added col: position frequency of samples to check'
-	,'how many samples correspond to a partiulcar posn associated with muts'
-    , '\nNo. of cols now', ncol(meta_with_afor)
-    , '\nNo. of cols before: ', orig_col2)
-
-sample_pos_occurrence = data.frame(meta_with_afor$id, meta_with_afor$position, meta_with_afor$sample_pos_occurrence)
-
-#=======================================================================
-cat('Begin merging dfs with NAs'
-	, '\n===============================================================')
-
-###########################
-# merging two dfs: with NA
-###########################
-# link col name  = 'Mutationinforamtion'
-cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)'
-    ,'\nlinking col: Mutationinforamtion'
-    ,'\nfilename: merged_df2')
-
-head(mcsm_data$Mutationinformation)
-head(meta_with_afor$Mutationinformation)
-
-#########
-# a) merged_df2: meta data with mcsm
-#########
-merged_df2 = merge(x = meta_with_afor
-                  , y = mcsm_data
-                  , by = 'Mutationinformation'
-                  , all.y = T)
-
-cat('Dim of merged_df2: '
-    , '\nNo. of rows: ', nrow(merged_df2)
-    , '\nNo. of cols: ', ncol(merged_df2))
-head(merged_df2$Position)
-
-if(nrow(meta_with_afor) == nrow(merged_df2)){
-  cat('nrow(merged_df2) = nrow (gene associated metadata)'
-      ,'\nExpected no. of rows: ',nrow(meta_with_afor) 
-      ,'\nGot no. of rows: ', nrow(merged_df2))
-} else{
-  cat('nrow(merged_df2)!= nrow(gene associated metadata)'
-      , '\nExpected no. of rows after merge: ', nrow(meta_with_afor)
-      , '\nGot no. of rows: ', nrow(merged_df2)
-      , '\nFinding discrepancy')
-  merged_muts_u = unique(merged_df2$Mutationinformation)
-  meta_muts_u = unique(meta_with_afor$Mutationinformation)
-    # find the index where it differs
-  unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
-}
-
-# sort by Position
-head(merged_df2$Position)
-merged_df2 = merged_df2[order(merged_df2$Position),]
-head(merged_df2$Position)
-
-merged_df2v2 = merge(x = meta_with_afor
-                   , y = mcsm_data
-                   , by = 'Mutationinformation'
-                   , all.x = T) 
-#!=!=!=!=!=!=!=!
-# COMMENT: used all.y since position 186 is not part of the struc,
-# hence doesn't have a mcsm value
-# but 186 is associated with mutation
-#!=!=!=!=!=!=!=!
-
-# should  be False
-identical(merged_df2, merged_df2v2)
-table(merged_df2$Position%in%merged_df2v2$Position)
-
-rm(merged_df2v2)
-
-#########
-# b) merged_df3: remove duplicate mutation information
-#########
-cat('Merging dfs with NAs: small df (removing duplicate muts)'
-    ,'\nCannot trust lineage info from this'
-    ,'\nlinking col: Mutationinforamtion'
-    ,'\nfilename: merged_df3')
-
-#==#=#=#=#=#=#
-# Cannot trust lineage, country from this df as the same mutation
-# can have many different lineages
-# but this should be good for the numerical corr plots
-#=#=#=#=#=#=#=
-merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] 
-head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
-
-# sanity checks
-# nrows of merged_df3 should be the same as the nrows of mcsm_data
-if(nrow(mcsm_data) == nrow(merged_df3)){
-  cat('PASS: No. of rows match with mcsm_data'
-      ,'\nExpected no. of rows: ', nrow(mcsm_data)
-      ,'\nGot no. of rows: ', nrow(merged_df3))
-} else {
-  cat('FAIL: No. of rows mismatch'
-      , '\nNo. of rows mcsm_data: ', nrow(mcsm_data)
-      , '\nNo. of rows merged_df3: ', nrow(merged_df3))
-}
-
-# counting NAs in AF, OR cols in merged_df3
-# this is becuase mcsm has no AF, OR cols,
-# so you cannot count NAs
-if (identical(sum(is.na(merged_df3$OR))
-              , sum(is.na(merged_df3$pvalue))
-              , sum(is.na(merged_df3$AF)))){
-  cat('PASS: NA count match for OR, pvalue and AF\n')
-  na_count_df3 = sum(is.na(merged_df3$AF))
-  cat('No. of NAs: ', sum(is.na(merged_df3$OR)))
-} else{
-  cat('FAIL: NA count mismatch'
-      , '\nNA in OR: ', sum(is.na(merged_df3$OR))
-      , '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue))
-      , '\nNA in AF:', sum(is.na(merged_df3$AF)))
-}
-#=======================================================================
-#%% merging without NAs
-
-cat('Begin merging dfs without NAs'
-	, '\n===============================================================')
-
-cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)'
-    ,'\nlinking col: Mutationinforamtion'
-    ,'\nfilename: merged_df2_comp')
-
-#########
-# c) merged_df2_comp: merging two dfs without NA
-#########
-merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] 
-#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),] 
-
-# sanity check
-cat('Checking nrows in merged_df2_comp')
-if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){
-  cat('PASS: No. of rows match'
-      ,'\nDim of merged_df2_comp: '
-      ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1
-      , '\nNo. of rows: ', nrow(merged_df2_comp)
-      , '\nNo. of cols: ', ncol(merged_df2_comp))
-}else{
-  cat('FAIL: No. of rows mismatch'
-      ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1
-      ,'\nGot no. of rows: ', nrow(merged_df2_comp))
-}
-
-#########
-# d) merged_df3_comp: remove duplicate mutation information
-#########
-merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] 
-
-cat('Dim of merged_df3_comp: '
-    , '\nNo. of rows: ', nrow(merged_df3_comp)
-    , '\nNo. of cols: ', ncol(merged_df3_comp))
-
-# alternate way of deriving merged_df3_comp
-foo = merged_df3[!is.na(merged_df3$AF),]
-# compare dfs: foo and merged_df3_com
-all.equal(foo, merged_df3)
-
-summary(comparedf(foo, merged_df3))
-
-# sanity check
-cat('Checking nrows in merged_df3_comp')
-if(nrow(merged_df3_comp) == nrow(merged_df3)){
-  cat('NO NAs detected in merged_df3 in AF|OR cols'
-      ,'\nNo. of rows are identical: ', nrow(merged_df3))
-} else{
-  if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) {
-  cat('PASS: NAs detected in merged_df3 in AF|OR cols'
-      , '\nNo. of NAs: ', na_count_df3
-      , '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3
-      , '\nGot no. of rows: ', nrow(merged_df3_comp))
-  }
-}
-
-#======================================================================= 
-# write_output files
-# Not required as this is a subset of the combining_two_df.R
-#*************************
-# clear variables
-rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir)
-rm(mut_pos_occurrence)
-#%% end of script
-#=======================================================================
-
-
diff --git a/meta_data_analysis/combining_df_ps.R b/meta_data_analysis/combining_df_ps.R
deleted file mode 100644
index 770661d..0000000
--- a/meta_data_analysis/combining_df_ps.R
+++ /dev/null
@@ -1,461 +0,0 @@
-#=======================================================================
-# TASK: To combine mcsm and meta data with af and or files  
-# Input csv files:
-# 1) mcsm normalised and struct params
-# 2) gene associated meta_data_with_AFandOR
-
-# Output: 
-# 1) muts with opposite effects on stability
-# 2) large combined df including NAs for AF, OR,etc
-# 		Dim: same no. of rows as gene associated meta_data_with_AFandOR
-# 3) small combined df including NAs for AF, OR, etc.
-# 		Dim: same as mcsm data
-# 4) large combined df excluding NAs 
-# 		Dim: dim(#1) - no. of NAs(AF|OR) + 1
-# 5) small combined df excluding NAs
-# 		Dim: dim(#2) - no. of unique NAs - 1
-# This script is sourced from other .R scripts for plotting
-#=======================================================================
-#%% specify curr dir
-getwd()
-setwd('~/git/LSHTM_analysis/meta_data_analysis/')
-getwd()
-#=======================================================================
-#%% load packages
-#require(data.table)
-#require(arsenal)
-#require(compare)
-#library(tidyverse)
-
-# header file
-header_dir = '~/git/LSHTM_analysis/'
-source(paste0(header_dir, '/', 'my_header.R'))
-#=======================================================================
-#%% variable assignment: input and output paths & filenames
-drug = 'pyrazinamide'
-gene = 'pncA'
-gene_match = paste0(gene,'_p.')
-cat(gene_match)
-
-#===========
-# data dir
-#===========
-datadir = '~/git/Data'
-
-#===========
-# input
-#===========
-# infile1: mCSM data
-#indir = '~/git/Data/pyrazinamide/input/processed/'
-indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline}
-#in_filename = 'mcsm_complex1_normalised.csv'
-in_filename = 'pnca_mcsm_struct_params.csv' 
-infile = paste0(indir, '/', in_filename)
-cat(paste0('Reading infile1: mCSM output file', ' ', infile, '\n') )
-
-# infile2: gene associated meta data combined with AF and OR
-#indir: same as above
-in_filename_comb = paste0(tolower(gene),  '_meta_data_with_AFandOR.csv')
-infile_comb = paste0(indir, '/', in_filename_comb)
-cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb, '\n'))
-
-#===========
-# output
-#===========
-# Uncomment if and when required to output
-outdir = paste0(datadir, '/', drug, '/', 'output') #same as indir
-cat('Output dir: ', outdir, '\n')
-#out_filename = paste0(tolower(gene), 'XXX')
-#outfile = paste0(outdir, '/', out_filename)
-#cat(paste0('Output file with full path:', outfile))
-
-#%% end of variable assignment for input and output files
-#=======================================================================
-#%% Read input files
-
-#####################################
-# input file 1: mcsm normalised data
-# output of step 4 mcsm_pipeline
-#####################################
-cat('Reading mcsm_data:'
-    , '\nindir: ', indir
-    , '\ninfile_comb: ', in_filename)
-
-mcsm_data = read.csv(infile
-#                     , row.names = 1
-                     , stringsAsFactors = F
-                     , header = T)
-
-cat('Read mcsm_data file:'
-    , '\nNo.of rows: ', nrow(mcsm_data)
-    , '\nNo. of cols:', ncol(mcsm_data))
-
-# clear variables
-rm(in_filename, infile)
-
-str(mcsm_data)
-
-table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
-
-# spelling Correction 1: DUET
-mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
-mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
-
-# checks: should be the same as above
-table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
-head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
-
-# spelling Correction 2: Ligand
-table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
-
-mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
-mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
-
-# checks: should be the same as above
-table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
-head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
-
-# muts with opposing effects on protomer and ligand stability
-table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)
-changes = mcsm_data[which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome),]
-
-# sanity check: redundant, but uber cautious!
-dl_i = which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)
-ld_i = which(mcsm_data$Lig_outcome != mcsm_data$DUET_outcome)
-
-cat('Identifying muts with opposite stability effects')
-if(nrow(changes) == (table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)[[2]]) & identical(dl_i,ld_i)) {
-  cat('PASS: muts with opposite effects on stability and affinity correctly identified'
-        , '\nNo. of such muts: ', nrow(changes))
-}else {
-  cat('FAIL: unsuccessful in extracting muts with changed stability effects')
-}
-
-#***************************
-# write file: changed muts
-out_filename = 'muts_opp_effects.csv'
-outfile = paste0(outdir, '/', out_filename)
-cat('Writing file for muts with opp effects:'
-    , '\nFilename: ', outfile
-    , '\nPath: ', outdir)
-
-write.csv(changes, outfile)
-#****************************
-# clear variables
-rm(out_filename, outfile)
-rm(changes, dl_i, ld_i)
-
-# count na in each column
-na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
-
-# sort by Mutationinformation
-mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
-head(mcsm_data$Mutationinformation)
-
-orig_col = ncol(mcsm_data)
-
-# get freq count of positions and add to the df
-setDT(mcsm_data)[, mut_pos_occurrence := .N, by = .(Position)] 
-
-cat('Added col: position frequency of muts to see which posn has how many muts'
-    , '\nNo. of cols now', ncol(mcsm_data)
-    , '\nNo. of cols before: ', orig_col)
-
-mut_pos_occurrence = data.frame(mcsm_data$Mutationinformation
-                                , mcsm_data$Position
-                                , mcsm_data$mut_pos_occurrence)
-
-colnames(mut_pos_occurrence) = c('Mutationinformation', 'position', 'mut_pos_occurrence')
-#######################################
-# input file 2: meta data with AFandOR
-#######################################
-cat('Reading combined meta data and AFandOR file:'
-    , '\nindir: ', indir
-    , '\ninfile_comb: ', in_filename_comb)
-
-meta_with_afor <- read.csv(infile_comb
-                      , stringsAsFactors = F
-                      , header = T)
-
-cat('Read mcsm_data file:'
-    , '\nNo.of rows: ', nrow(meta_with_afor)
-    , '\nNo. of cols:', ncol(meta_with_afor))
-
-# counting NAs in AF, OR cols
-if (identical(sum(is.na(meta_with_afor$OR))
-              , sum(is.na(meta_with_afor$pvalue))
-              , sum(is.na(meta_with_afor$AF)))){
-  cat('PASS: NA count match for OR, pvalue and AF\n')
-  na_count = sum(is.na(meta_with_afor$AF))
-  cat('No. of NAs: ', sum(is.na(meta_with_afor$OR)))
-} else{
-  cat('FAIL: NA count mismatch'
-      , '\nNA in OR: ', sum(is.na(meta_with_afor$OR))
-      , '\nNA in pvalue: ', sum(is.na(meta_with_afor$pvalue))
-      , '\nNA in AF:', sum(is.na(meta_with_afor$AF)))
-}
-
-# clear variables
-rm(in_filename_comb, infile_comb)
-
-str(meta_with_afor)
-
-# sort by Mutationinformation
-head(meta_with_afor$Mutationinformation)
-meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
-head(meta_with_afor$Mutationinformation)
-
-orig_col2 = ncol(meta_with_afor)
-
-# get freq count of positions and add to the df
-setDT(meta_with_afor)[, sample_pos_occurrence := .N, by = .(position)] 
-
-cat('Added col: position frequency of samples to check'
-	,'how many samples correspond to a partiulcar posn associated with muts'
-    , '\nNo. of cols now', ncol(meta_with_afor)
-    , '\nNo. of cols before: ', orig_col2)
-
-sample_pos_occurrence = data.frame(meta_with_afor$id
-                                   , meta_with_afor$mutation
-                                   , meta_with_afor$Mutationinformation
-                                   , meta_with_afor$position
-                                   , meta_with_afor$sample_pos_occurrence)
-colnames(sample_pos_occurrence) = c('id', 'mutation', 'Mutationinformation', 'position', 'sample_pos_occurrence')
-#=======================================================================
-cat('Begin merging dfs with NAs'
-	, '\n===============================================================')
-
-###########################
-# merging two dfs: with NA
-###########################
-# link col name  = 'Mutationinforamtion'
-head(mcsm_data$Mutationinformation)
-head(meta_with_afor$Mutationinformation)
-
-cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)'
-    ,'\nlinking col: Mutationinforamtion'
-    ,'\nfilename: merged_df2')
-
-#########
-# a) merged_df2: meta data with mcsm
-#########
-merged_df2 = merge(x = meta_with_afor
-                  ,y = mcsm_data
-                  , by = 'Mutationinformation'
-                  , all.y = T)
-
-cat('Dim of merged_df2: '
-    , '\nNo. of rows: ', nrow(merged_df2)
-    , '\nNo. of cols: ', ncol(merged_df2))
-head(merged_df2$Position)
-
-# sanity check
-cat('Checking nrows in merged_df2')
-if(nrow(meta_with_afor) == nrow(merged_df2)){
-  cat('nrow(merged_df2) = nrow (gene associated metadata)'
-      ,'\nExpected no. of rows: ',nrow(meta_with_afor) 
-      ,'\nGot no. of rows: ', nrow(merged_df2))
-} else{
-  cat('nrow(merged_df2)!= nrow(gene associated metadata)'
-      , '\nExpected no. of rows after merge: ', nrow(meta_with_afor)
-      , '\nGot no. of rows: ', nrow(merged_df2)
-      , '\nFinding discrepancy')
-  merged_muts_u = unique(merged_df2$Mutationinformation)
-  meta_muts_u = unique(meta_with_afor$Mutationinformation)
-    # find the index where it differs
-  unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
-}
-
-# sort by Position
-head(merged_df2$Position)
-merged_df2 = merged_df2[order(merged_df2$Position),]
-head(merged_df2$Position)
-
-merged_df2v2 = merge(x = meta_with_afor
-                   ,y = mcsm_data
-                   , by = 'Mutationinformation'
-                   , all.x = T) 
-#!=!=!=!=!=!=!=!
-# COMMENT: used all.y since position 186 is not part of the struc,
-# hence doesn't have a mcsm value
-# but 186 is associated with mutation
-#!=!=!=!=!=!=!=!
-
-# should  be False
-identical(merged_df2, merged_df2v2)
-table(merged_df2$Position%in%merged_df2v2$Position)
-
-rm(merged_df2v2)
-
-#########
-# b) merged_df3:remove duplicate mutation information
-#########
-cat('Merging dfs without NAs: small df (removing muts with no AF|OR associated)'
-    ,'\nCannot trust lineage info from this'
-    ,'\nlinking col: Mutationinforamtion'
-    ,'\nfilename: merged_df3')
-
-#==#=#=#=#=#=#
-# Cannot trust lineage, country from this df as the same mutation
-# can have many different lineages
-# but this should be good for the numerical corr plots
-#=#=#=#=#=#=#=
-merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] 
-head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
-
-# sanity check
-cat('Checking nrows in merged_df3')
-if(nrow(mcsm_data) == nrow(merged_df3)){
-  cat('PASS: No. of rows match with mcsm_data'
-      ,'\nExpected no. of rows: ', nrow(mcsm_data)
-      ,'\nGot no. of rows: ', nrow(merged_df3))
-} else {
-  cat('FAIL: No. of rows mismatch'
-      , '\nNo. of rows mcsm_data: ', nrow(mcsm_data)
-      , '\nNo. of rows merged_df3: ', nrow(merged_df3))
-}
-
-# counting NAs in AF, OR cols in merged_df3
-# this is becuase mcsm has no AF, OR cols,
-# so you cannot count NAs
-if (identical(sum(is.na(merged_df3$OR))
-              , sum(is.na(merged_df3$pvalue))
-              , sum(is.na(merged_df3$AF)))){
-  cat('PASS: NA count match for OR, pvalue and AF\n')
-  na_count_df3 = sum(is.na(merged_df3$AF))
-  cat('No. of NAs: ', sum(is.na(merged_df3$OR)))
-} else{
-  cat('FAIL: NA count mismatch'
-      , '\nNA in OR: ', sum(is.na(merged_df3$OR))
-      , '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue))
-      , '\nNA in AF:', sum(is.na(merged_df3$AF)))
-}
-#=======================================================================
-#%% merging without NAs
-
-cat('Begin merging dfs without NAs'
-	, '\n===============================================================')
-
-cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)'
-    ,'\nlinking col: Mutationinforamtion'
-    ,'\nfilename: merged_df2_comp')
-
-#########
-# c) merged_df2_comp: same as merge 1 but excluding NA
-#########
-merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] 
-#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),] 
-
-# sanity check
-cat('Checking nrows in merged_df2_comp')
-if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){
-  cat('PASS: No. of rows match'
-      ,'\nDim of merged_df2_comp: '
-      ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1
-      , '\nNo. of rows: ', nrow(merged_df2_comp)
-      , '\nNo. of cols: ', ncol(merged_df2_comp))
-}else{
-  cat('FAIL: No. of rows mismatch'
-      ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1
-      ,'\nGot no. of rows: ', nrow(merged_df2_comp))
-}
-
-#########
-# d) merged_df3_comp: remove duplicate mutation information
-#########
-merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] 
-
-cat('Dim of merged_df3_comp: '
-    , '\nNo. of rows: ', nrow(merged_df3_comp)
-    , '\nNo. of cols: ', ncol(merged_df3_comp))
-
-# alternate way of deriving merged_df3_comp
-foo = merged_df3[!is.na(merged_df3$AF),]
-# compare dfs: foo and merged_df3_com
-all.equal(foo, merged_df3)
-
-summary(comparedf(foo, merged_df3))
-
-# sanity check
-cat('Checking nrows in merged_df3_comp')
-if(nrow(merged_df3_comp) == nrow(merged_df3)){
-  cat('NO NAs detected in merged_df3 in AF|OR cols'
-      ,'\nNo. of rows are identical: ', nrow(merged_df3))
-} else{
-  if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) {
-  cat('PASS: NAs detected in merged_df3 in AF|OR cols'
-      , '\nNo. of NAs: ', na_count_df3
-      , '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3
-      , '\nGot no. of rows: ', nrow(merged_df3_comp))
-  }
-}
-#======================================================================= 
-#*********************
-# writing 1 file in the style of a loop: merged_df3
-# print(output dir)
-#i = 'merged_df3'
-#out_filename = paste0(i, '.csv')
-#outfile = paste0(outdir, '/', out_filename)
-
-#cat('Writing output file: '
-#    ,'\nFilename: ', out_filename
-#    ,'\nPath: ', outdir)
-
-#template: write.csv(merged_df3, 'merged_df3.csv')
-#write.csv(get(i), outfile, row.names = FALSE)
-#cat('Finished writing: ', outfile
-#    , '\nNo. of rows: ', nrow(get(i))
-#    , '\nNo. of cols: ', ncol(get(i)))
-
-#%% write_output files;  all 4 files: 
-outvars = c('merged_df2'
-             , 'merged_df3'
-             , 'merged_df2_comp'
-             , 'merged_df3_comp')
-
-cat('Writing output files: '
-    , '\nPath:', outdir)
-
-for (i in outvars){
-#  cat(i, '\n')
-  out_filename = paste0(i, '.csv')
-#  cat(out_filename, '\n')
-#  cat('getting value of variable: ', get(i))
-  outfile = paste0(outdir, '/', out_filename)
-#  cat('Full output path: ', outfile, '\n')
-  cat('Writing output file:'
-      ,'\nFilename: ', out_filename,'\n')
-  write.csv(get(i), outfile, row.names = FALSE)
-  cat('Finished writing: ', outfile
-      , '\nNo. of rows: ', nrow(get(i))
-      , '\nNo. of cols: ', ncol(get(i)), '\n')
-}
-
-# alternate way to replace with implicit loop 
-# FIXME
-#sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, '/', outvars, '.csv')))
-
-#======================================================================= 
-#%% merging mut_pos_occurrence and sample_pos_occurence
-# FIXME
-#cat('Merging dfs with positional frequency from mcsm and meta_data'
-#    , '\nNcol in mut_pos_occurrence:', ncol(mut_pos_occurrence)
-#    , '\nncol in sample_pos_occurence:', ncol(sample_pos_occurrence)
-#    ,'\nlinking col:', intersect(colnames(sample_pos_occurrence), colnames(mut_pos_occurrence))
-#    ,'\nfilename: merged_df4')
-
-#merged_df4 = merge(sample_pos_occurrence, mut_pos_occurrence
-#                   , by = 'position'
-#                   , all = T)
-
-#out_filename4 = 'mut_and_sample_freq.csv'
-#outfile4 = paste0(outdir, '/', out_filename4)
-
-#*************************
-# clear variables
-rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir)
-rm(mut_pos_occurrence, sample_pos_occurrence)
-#rm(merged_df4)
-#%% end of script
-#=======================================================================
-
diff --git a/meta_data_analysis/dssp_df.py b/meta_data_analysis/dssp_df.py
deleted file mode 100755
index a12f7d5..0000000
--- a/meta_data_analysis/dssp_df.py
+++ /dev/null
@@ -1,170 +0,0 @@
-#!/home/tanu/anaconda3/envs/ContactMap/bin/python3
-# -*- coding: utf-8 -*-
-"""
-Created on Tue Feb 18 10:10:12 2020
-
-@author: tanu
-"""
-#=======================================================================
-# Task: Read a DSSP file into a data frame and output to a csv file
-
-# Input: '.dssp' i.e gene associated.dssp file (output from run_dssp.sh)
-
-# Output: '.csv' file containing DSSP output as a df with ASA, RSA, etc.
-	# based on Tien at al 2013 (theor.) values
-
-# useful links:
-# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
-# https://en.wikipedia.org/wiki/Relative_accessible_surface_area
-#=======================================================================
-#%% load packages
-import sys, os
-import re
-import pandas as pd
-from Bio.PDB import PDBParser
-from Bio.PDB.DSSP import DSSP
-import pprint as pp
-import dms_tools2
-import dms_tools2.dssp
-
-#=======================================================================#
-#%% specify homedir and curr dir
-homedir = os.path.expanduser('~') 
-
-# set working dir
-os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
-os.getcwd()
-#=======================================================================
-#%% variable assignment: input and output 
-#drug = 'pyrazinamide'
-#gene = 'pncA'
-#gene_match = gene + '_p.'
-
-drug = 'cycloserine'
-gene = 'alr'
-#==========
-# data dir
-#==========
-#indir = 'git/Data/pyrazinamide/input/original'
-datadir = homedir + '/' + 'git/Data'
-
-#=======
-# input from outdir
-#=======
-indir = datadir + '/' + drug + '/' + 'input'
-#1) pdb file
-in_filename_pdb = gene.lower() + '_complex' + '.pdb'
-infile_pdb = indir + '/' + in_filename_pdb
-print('Input pdb filename:', in_filename_pdb
-      , '\npath:', indir
-      , '\n============================================================')
-
-#2) dssp file
-outdir = datadir + '/' + drug + '/' + 'output'
-in_filename = gene.lower() +'.dssp'
-infile = outdir + '/' + in_filename
-print('Input dssp filename:', in_filename
-      , '\npath:', outdir
-      , '\n============================================================')
-
-#=======
-# output 
-#=======
-outdir = datadir + '/' + drug + '/' + 'output'
-out_filename = gene.lower() + '_dssp.csv'
-outfile =  outdir + '/' + out_filename
-print('Output filename:', out_filename
-      , '\npath:', outdir
-      , '\nOutfile: ', outfile
-      , '\n=============================================================')
-#%% end of variable assignment for input and output files
-#=======================================================================
-#%% specify pdb chain as a list. Handy when more than 1 pdb chain
-my_chains = ['A']
-# my_chains = ['A', 'B'] # for cycloserine
-
-# generate my_chains from dssp
-p = PDBParser()
-structure = p.get_structure(in_filename_pdb, infile_pdb)
-model = structure[0]
-dssp = DSSP(model, infile_pdb)
-
-#print(dssp.keys())
-#print(dssp.keys()[0][0])
-#print(len(dssp))
-#print(dssp.keys()[0][0])
-#print(dssp.keys()[len(dssp)-1][0])
-
-dssp_chains = []
-for num_aa in range(0, len(dssp)):
-#    print(num_aa)
-#   extract the chain id only and append to a list
-    dssp_chains.append(dssp.keys()[num_aa][0])
-    chainsL = list(set(dssp_chains))
-    
-print(chainsL)
-# sort the list to make for sanity (since sets are not ordered)
-# this will be required for dssp_df
-my_chains = sorted(chainsL)
-
-print('dssp output for'
-      , in_filename, 'contains:', len(my_chains)
-      , 'chains:\n', my_chains)
-#%% ====================================================================
-# Process dssp output and extract into df (single chain)
-
-#dssp_df = dms_tools2.dssp.processDSSP(infile, chain = my_chains)
-#dssp_df['chain_dssp'] = my_chains
-#pp.pprint(dssp_df)
-#=======================================================================
-# incase pdb has > 1 chain and you need to run it for all chains
-
-# initialise an empty df
-dssp_df =  pd.DataFrame()
-print('Total no. of chains: ', len(my_chains))
-for chain_id in my_chains:
-    print('Chain id:', chain_id)
-    dssp_cur = pd.DataFrame()
-    dssp_cur = dms_tools2.dssp.processDSSP(infile, chain = chain_id)
-    #!!!Important!!!
-    dssp_cur['chain_id'] = chain_id 
-    dssp_df = dssp_df.append(dssp_cur)
-    pp.pprint(dssp_df)
-
-#=====================
-# Renaming amino-acid
-# and site columns
-#=====================
-# Rename column (amino acid) as 'wild_type' and (site} as 'position' 
-# to be the same names as used in the file required for merging later.
-dssp_df.columns
-dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True)
-dssp_df.columns
-
-# sanity check
-if len(dssp_df) == len(dssp):
-    print('PASS: length of dssp_df has correct length')
-else:
-    print('FAIL: length mismatch for dssp_df'
-          , '\nexpected length:', len(dssp)
-          , '\nGot length:', len(dssp_df)
-          , 'Debug please!')
-    
-#%% Write ouput csv file
-print('Writing file:', outfile
-      , '\nFilename:', out_filename
-      , '\nPath:',  outdir
-      , '\n=============================================================')
-
-# write to csv
-dssp_df.to_csv(outfile, header=True, index = False)
-
-print('Finished writing:', out_filename
-     , '\nNo. of rows:', len(dssp_df)
-     , '\nNo. of cols:', len(dssp_df.columns)
-     , '\n==============================================================')
-#%% end of script
-#=======================================================================
-#%% run dssp to extract chain ids to later process the dssp output into a df
-
diff --git a/meta_data_analysis/init_data_dirs.py b/meta_data_analysis/init_data_dirs.py
deleted file mode 100755
index 5037258..0000000
--- a/meta_data_analysis/init_data_dirs.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/python3
-# Initialise a blank 'Data' directory and drug subdirs etc.
-# TODO:
-# - Read base dir from config file
-# - Create eg: '~/git/Data/{original,processed}
-# - Create eg: '~/git/Data/processed/' + drug (for each drug)
-# - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'
-
-
-#%% specify homedir as python doesn't recognise tilde
-homedir = os.path.expanduser('~') 
-
-#%% variable assignment: input and output paths & filenames
-drug = 'pyrazinamide'
-gene = 'pncA'
-gene_match = gene + '_p.'
-
-#==========
-# data dir
-#==========
-#indir = 'git/Data/pyrazinamide/input/original'
-datadir = homedir + '/' + 'git/Data'
-
-#==========
-# input dir
-#==========
-indir = datadir + '/' + drug + '/' + 'input'
-
-#============
-# output dir
-#============
-# several output files
-outdir = datadir + '/' + drug + '/' + 'output'
-
-#%%end of variable assignment for input and output files
-#==============================================================================
diff --git a/meta_data_analysis/kd_df.py b/meta_data_analysis/kd_df.py
deleted file mode 100644
index c08f157..0000000
--- a/meta_data_analysis/kd_df.py
+++ /dev/null
@@ -1,193 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-'''
-Created on Tue Aug  6 12:56:03 2019
-
-@author: tanu
-'''
-#=======================================================================
-# Task: Hydrophobicity (Kd) values for amino acid sequence using the
-# Kyt&-Doolittle.
-# Same output as using the expasy server https://web.expasy.org/protscale/
-# Input: fasta file
-    
-# Output: csv file with 
-
-
-# useful links
-# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
-# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
-#=======================================================================
-#%% load packages
-from pylab import *
-from Bio.SeqUtils import ProtParamData
-from Bio.SeqUtils.ProtParam import ProteinAnalysis
-from Bio import SeqIO
-#from Bio.Alphabet.IUPAC import IUPACProtein
-#import pprint as pp
-import pandas as pd
-import numpy as np
-import sys, os
-#=======================================================================
-#%% specify input and curr dir
-homedir = os.path.expanduser('~')
-
-# set working dir
-os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
-os.getcwd()
-#=======================================================================
-#%% variable assignment: input and output 
-drug = 'pyrazinamide'
-gene = 'pncA'
-gene_match = gene + '_p.'
-
-#==========
-# data dir
-#==========
-#indir = 'git/Data/pyrazinamide/input/original'
-datadir = homedir + '/' + 'git/Data'
-
-#=======
-# input
-#=======
-#indir = 'git/Data/pyrazinamide/input/original'
-indir = datadir + '/' + drug + '/' + 'input'
-in_filename = '3pl1.fasta.txt'
-infile = indir + '/' + in_filename
-print('Input filename:', in_filename
-      , '\nInput path:', indir
-      , '\n============================================================')
-
-#=======
-# output 
-#=======
-outdir =   datadir + '/' + drug + '/' + 'output'
-out_filename = gene.lower() + '_kd.csv'
-outfile =  outdir + '/' + out_filename
-print('Output filename:', out_filename
-      , '\nOutput path:', outdir
-      , '\n=============================================================')
-#%% end of variable assignment for input and output files
-#=======================================================================
-#===================
-#calculate KD values: same as the expasy server
-#===================
-#%%specify window size for hydropathy profile computation
-# https://web.expasy.org/protscale/pscale/protscale_help.html
-my_window = 3
-offset = round((my_window/2)-0.5)
-
-fh = open(infile)
-
-for record in SeqIO.parse(fh, 'fasta'):
-    id = record.id
-    seq = record.seq
-    num_residues = len(seq)
-fh.close()
-
-sequence = str(seq)
-
-X = ProteinAnalysis(sequence)
-
-kd_values = (X.protein_scale(ProtParamData.kd , window = my_window)) # edge weight is set to  default (100%)
-
-# sanity checks 
-print('Sequence Length:', num_residues)
-print('kd_values Length:',len(kd_values))
-print('Window Length:', my_window)
-print('Window Offset:', offset)
-print('=================================================================')
-print('Checking:len(kd values) is as expected for the given window size & offset...')
-expected_length =  num_residues - (my_window - offset) 
-if len(kd_values) == expected_length:
-    print('PASS: expected and actual length of kd values match')
-else:
-    print('FAIL: length mismatch'
-          ,'\nExpected length:', expected_length
-          ,'\nActual length:', len(kd_values)
-          , '\n=========================================================')
-#===================
-# creating two dfs
-#===================
-#%% make 2 dfs; 1) aa sequence and 2) kd_values. Then reset index for each df 
-# which will allow easy merging of the two dfs.
-
-# df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence)
-# Name column of wt as 'wild_type' to be the same name used in the file required for merging later.
-dfSeq = pd.DataFrame({'wild_type_kd':list(sequence)})
-dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive
-
-# df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values
-dfVals = pd.DataFrame({'kd_values':kd_values})
-dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
-
-# sanity checks
-max(dfVals['kd_values'])
-min(dfVals['kd_values'])
-
-#===================
-# concatenating dfs
-#===================
-# Merge the two on index 
-# (as these are now reflective of the aa position numbers): df1 and df2 
-# This will introduce NaN where there is missing values. In our case this
-# will be 2 (first and last ones based on window size and offset)
-# In our case this will be 2 (first and last ones)
-# For pnca: the last position is not part of the struc, so not info loss
-# Needless to say that this will be variable for other targets.
-
-kd_df = pd.concat([dfSeq, dfVals], axis = 1)
-
-#============================
-# renaming index to position
-#============================
-kd_df = kd_df.rename_axis('position')
-kd_df.head
-print('=================================================================')
-
-print('position col i.e. index should be numeric
-	, '\n===============================================================')
-	
-if kd_df.index.dtype == 'int64':
-    print('PASS: position col is numeric'
-          , '\ndtype is:', kd_df.index.dtype)
-else:
-    print('FAIL: position col is not numeric'
-          , '\nConverting to numeric')
-    kd_df.index.astype('int64')
-    print('Checking dtype for after conversion:\n'
-          , '\ndtype is:', kd_df.index.dtype
-          , '\n=========================================================')
-#===============
-# writing file
-#===============
-print('Writing file:', out_filename
-      , '\nFilename:', out_filename
-      , '\nPath:',  outdir
-      , '\n=============================================================')
-
-kd_df.to_csv(outfile, header = True, index = True)
-
-print('Finished writing:', out_filename
-      , '\nNo. of rows:', len(kd_df)
-      , '\nNo. of cols:', len(kd_df.columns)
-      , '\n=============================================================')
-
-#===============
-# plot: optional!
-#===============#%% plot
-# http://www.dalkescientific.com/writings/NBN/plotting.html
-
-# FIXME: save fig
-# extract just pdb if from 'id' to pass to title of plot
-# foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
-plot(kd_values, linewidth = 1.0)
-#axis(xmin = 1, xmax = num_residues)
-xlabel('Residue Number')
-ylabel('Hydrophobicity')
-title('K&D Hydrophobicity for ' + id)
-show()
-print('======================================================================')
-#%% end of script
-#=======================================================================
diff --git a/meta_data_analysis/mut_electrostatic_changes.py b/meta_data_analysis/mut_electrostatic_changes.py
deleted file mode 100755
index 20ea86a..0000000
--- a/meta_data_analysis/mut_electrostatic_changes.py
+++ /dev/null
@@ -1,167 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-'''
-Created on Tue Aug  6 12:56:03 2019
-
-@author: tanu
-'''
-
-# FIXME: import dirs.py to get the basic dir paths available
-#=======================================================================
-# TASK: calculate how many mutations result in 
-# electrostatic changes wrt wt
-
-# Input: mcsm and AF_OR file
-
-# Output: mut_elec_changes_results.txt 
-#=======================================================================
-#%% load libraries
-import os, sys
-import pandas as pd
-#import numpy as np
-#=======================================================================
-#%% specify homedir and curr dir
-homedir = os.path.expanduser('~') 
-
-# set working dir
-os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
-os.getcwd()
-#=======================================================================
-#%% variable assignment: input and output 
-drug = 'pyrazinamide'
-gene = 'pncA'
-gene_match = gene + '_p.'
-
-#==========
-# data dir
-#==========
-#indir = 'git/Data/pyrazinamide/input/original'
-datadir = homedir + '/' + 'git/Data'
-
-#=======
-# input
-#=======
-indir = datadir + '/' + drug + '/' + 'input'
-in_filename  = 'merged_df3.csv'
-infile = outdir + '/' + in_filename
-print('Input filename: ', in_filename
-      , '\nInput path: ', indir
-      , '\n============================================================')
-
-#=======
-# output 
-#=======
-outdir = datadir + '/' + drug + '/' + 'output'
-# specify output file 
-out_filename = 'mut_elec_changes.txt'
-outfile =  outdir + '/' + out_filename
-print('Output filename: ', out_filename
-      , '\nOutput path: ', outdir
-      , '\n============================================================')
-
-#%% end of variable assignment for input and output files
-#=======================================================================
-#%% Read input files
-print('Reading input file (merged file):', infile)
-
-comb_df = pd.read_csv(infile, sep = ',')  
-
-print('Input filename: ', in_filename
-      , '\nPath :', outdir
-      , '\nNo. of rows: ', len(comb_df)
-      , '\nNo. of cols: ', infile
-      , '\n============================================================')
-
-# column names
-list(comb_df.columns)
-
-# clear variables
-del(in_filename, infile)
-
-#%% subset unique mutations
-df = comb_df.drop_duplicates(['Mutationinformation'], keep = 'first')
-
-total_muts = df.Mutationinformation.nunique()
-#df.Mutationinformation.count()
-print('Total mutations associated with structure: ', total_muts
-	, '\n===============================================================')
-
-#%% combine aa_calcprop cols so that you can count the changes as value_counts
-# check if all muts have been categorised
-print('Checking if all muts have been categorised: ')
-if df['wt_calcprop'].isna().sum() == 0 & df['mut_calcprop'].isna().sum():
-    print('PASS: No. NA detected i.e all muts have aa prop associated'
-    , '\n===============================================================')
-else:
-    print('FAIL: NAs detected i.e some muts remain unclassified'
-    , '\n===============================================================')
-
-df['wt_calcprop'].head()
-df['mut_calcprop'].head()
-
-print('Combining wt_calcprop and mut_calcprop...')
-#df['aa_calcprop_combined'] = df['wt_calcprop']+ '->' + df['mut_calcprop']
-df['aa_calcprop_combined'] = df.wt_calcprop.str.cat(df.mut_calcprop, sep = '->')
-df['aa_calcprop_combined'].head()
-
-mut_categ = df["aa_calcprop_combined"].unique()
-print('Total no. of aa_calc properties: ', len(mut_categ))
-print('Categories are: ', mut_categ)
-
-# counting no. of muts in each mut categ
-
-# way1: count values within each combinaton
-df.groupby('aa_calcprop_combined').size()
-#df.groupby('aa_calcprop_combined').count()
-
-# way2: count values within each combinaton
-df['aa_calcprop_combined'].value_counts()
-
-# comment: the two ways should be identical
-# groupby result order is similar to pivot table order,
-# I prefer the value_counts look
-
-# assign to variable: count values within each combinaton
-all_prop = df['aa_calcprop_combined'].value_counts()
-
-# convert to a df from Series
-ap_df = pd.DataFrame({'aa_calcprop': all_prop.index, 'mut_count': all_prop.values})
-
-# subset df to contain only the changes in prop
-all_prop_change = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == False]
-                      
-elec_count = all_prop_change.mut_count.sum()
-print('Total no.of muts with elec changes: ', elec_count)
-
-# calculate percentage of electrostatic changes
-elec_changes = (elec_count/total_muts) * 100
-           
-print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes)
- 
-# check no change muts
-no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == True]
-  
-no_change_muts.mut_count.sum()
-                     
-#%% output from console
-#sys.stdout = open(file, 'w')
-sys.stdout = open(outfile, 'w')
-     
-print('======================\n'
-      ,'Unchanged muts'
-      ,'\n=====================\n'
-      , no_change_muts
-      ,'\n=============================\n'      
-      , 'Muts with changed prop:'
-      , '\n============================\n'
-      , all_prop_change)
-
-print('================================================================='                
-, '\nTotal number of electrostatic changes resulting from Mtation is (%):', elec_changes
-, '\nTotal no. of muts: ', total_muts
-, '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum()
-, '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum() 
-, '\n===================================================================')  
-#%% end of script
-#=======================================================================
diff --git a/meta_data_analysis/mutate.py b/meta_data_analysis/mutate.py
deleted file mode 100644
index fcabc70..0000000
--- a/meta_data_analysis/mutate.py
+++ /dev/null
@@ -1,179 +0,0 @@
-#!/usr/bin/python
-from __future__ import print_function
-from Bio import SeqIO
-from Bio.Seq import Seq
-from collections import OrderedDict
-import sys
-import argparse
-# https://github.com/jrjhealey/bioinfo-tools/blob/master/Mutate.py
-# https://www.biostars.org/p/336891/
-
-# TODO:
-#  - create some logic to 'group' mutations that will be applied to the same sequence, to
-#    make all switches at once
-#    - This will also probably break the verbose transversion output so the maths will need replacing
-#  - Create the ability to support INDELS (will also require pairwise alignment so that
-#    hamming distances remain meaningful.
-
-
-def get_args():
-    """Parse command line arguments"""
-    desc = "Mutate fasta sequences based on a file of sequence mappings."
-    epi = (
-        "This script takes a mapfile of the form:\n"
-        " SequenceID,A123B\n"
-        " SequenceID,X456Y\n"
-        "And performs substitutions/mutations. At preset it only does one SNP per sequence.\n"
-    )
-
-    try:
-        parser = argparse.ArgumentParser(
-            description=desc, epilog=epi, formatter_class=argparse.RawTextHelpFormatter
-        )
-        parser.add_argument(
-            "mutation_file",
-            action="store",
-            help='File of mutation mappings like so: "SeqID,X123Y"',
-        )
-        parser.add_argument(
-            "sequences",
-            action="store",
-            help="File of sequences to be mutated (fasta only).",
-        )
-        parser.add_argument(
-            "-v",
-            "--verbose",
-            action="store_true",
-            help="Verbose behaviour, printing parameters of the script.",
-        )
-        parser.add_argument(
-            "-o",
-            "--outfile",
-            action="store",
-            help="Output file for mutated sequences (default STDOUT).",
-        )
-        if len(sys.argv) == 1:
-            parser.print_help(sys.stderr)
-            exit(1)
-    except:
-        sys.stderr.write(
-            "An exception occurred with argument parsing. Check your provided options.\n"
-        )
-
-    return parser.parse_args()
-
-
-class Mutation(object):
-    """A class wrapper for sequence IDs so that duplicate IDs can be used in a dictionary"""
-
-    def __init__(self, name):
-        self.name = name
-
-    def __repr__(self):
-        return "'" + self.name + "'"
-
-    def __str__(self):
-        return self.name
-
-
-def parse_mapfile(mapfile):
-    """Return a dict of mapped mutations.
-    File should resemble:
-     SequenceID,A123B
-     SequenceID2,X234Y
-    Sequence IDs should exactly match the fasta headers, as parsed by BioPython.
-    (">" symbols are optional)
-    """
-
-    with open(mapfile, "r") as handle:
-        mut_dict = OrderedDict()
-        for line in handle:
-            id, change = line.lstrip(">").rstrip("\n").split(",")
-            mut_dict[Mutation(id)] = change
-
-    for k, v in mut_dict.items():
-        assert v[0].isalpha(), (
-            "First character of mutation map is not a valid letter. Got: %s" % v[0]
-        )
-        assert v[-1].isalpha(), (
-            "Last character of mutation map is not a valid letter. Got: %s" % v[-1]
-        )
-        assert v[1:-1].isdigit(), (
-            "Location string of mutation map is not a valid number. Got: %s" % v[1:-1]
-        )
-
-    return mut_dict
-
-
-def morph(orig, loc, new, mutableseq, verbose):
-    """Perform actual sequence change (polymorphism only at present)"""
-    # Shift location to offset 0-based index
-    loc = loc - 1
-    assert mutableseq[loc] == orig, (
-        "Sequence does not match the mutation file for pre-exising residue. Expected %s , got %s "
-        % (orig, mutableseq[loc])
-    )
-    if verbose is True:
-        print(
-            "Performing change: {} -> {}, at location: {} (0 based)".format(
-                orig, new, loc
-            )
-        )
-    mutableseq[loc] = new
-    return mutableseq
-
-
-def hamming_distance(s1, s2):
-    """Return the Hamming distance between equal-length sequences"""
-    if len(s1) != len(s2):
-        raise ValueError("Undefined for sequences of unequal length")
-    return sum(ch1 != ch2 for ch1, ch2 in zip(s1.upper(), s2.upper()))
-
-
-def main():
-    args = get_args()
-    if args.outfile is not None:
-        ofh = open(args.outfile, "w")
-
-    # Parse the mutation file (get mutations by sequence)
-    mutations = parse_mapfile(args.mutation_file)
-    if args.verbose is True:
-        print("Got mutations:")
-        print(mutations)
-    # Iterate all sequences and make any substitutions necessary
-    for record in SeqIO.parse(args.sequences, "fasta"):
-        for k, v in mutations.items():
-            mutable = record.seq.upper().tomutable()
-#            mutable = record.seq.tomutable()
-#            print("MO:", mutable)
-            if k.name == record.id[0:4]: # BEWARE HARDCODING
-#                print("k.name:", k.name, "record.id:", record.id)
-                orig = v[0]
-                print("orig:", orig)
-                new = v[-1]
-                loc = int(v[1:-1])
-                if args.verbose:
-                    print(record.id)
-                newseq = morph(orig, loc, new, mutable, args.verbose)
-#                print("NS is:", newseq)
-                if args.verbose is True:
-                    print("Original: " + record.seq.upper())
-                    print(
-                        str((" " * int(loc - 2 + 11))) + "V"
-                    )  # Padded string to show where switch happened (not sure how it'll deal with line wrapping
-                    print("New:      " + newseq)
-                    print(
-                        "Distance: "
-                        + str(hamming_distance(str(record.seq), str(newseq)))
-                    )
-                if args.outfile is not None:
-                    ofh.write(">%s_%s\n%s\n" % (record.id, v, newseq))
-                if args.verbose is False:
-                    print(">%s_%s\n%s\n" % (record.id, v, newseq))
-
-    if args.outfile is not None:
-        ofh.close()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/meta_data_analysis/rd_df.py b/meta_data_analysis/rd_df.py
deleted file mode 100755
index 0074959..0000000
--- a/meta_data_analysis/rd_df.py
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-'''
-Created on Tue Aug  6 12:56:03 2019
-
-@author: tanu
-'''
-#=============================================================================
-# Task: Residue depth (rd) processing to generate a df with residue_depth(rd)
-# values
-
-# FIXME
-# Input: '.tsv' i.e residue depth txt file (output from .zip file manually
-# downloaded from the website). 
-# This should be integrated into the pipeline
-
-# Output: .csv with 3 cols i.e position, rd_values & 3-letter wt aa code(caps)
-#=============================================================================
-#%% load packages
-import sys, os
-import pandas as pd
-#import numpy as np
-#=============================================================================
-#%% specify input and curr dir
-homedir = os.path.expanduser('~')
-
-# set working dir
-os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
-os.getcwd()
-#=============================================================================
-#%% variable assignment: input and output 
-drug = 'pyrazinamide'
-gene = 'pncA'
-gene_match = gene + '_p.'
-
-#==========
-# data dir
-#==========
-#indir = 'git/Data/pyrazinamide/input/original'
-datadir = homedir + '/' + 'git/Data'
-
-#=======
-# input
-#=======
-#indir = 'git/Data/pyrazinamide/input/original'
-indir = datadir + '/' + drug + '/' + 'output'
-in_filename = '3pl1_rd.tsv'
-infile = indir + '/' + in_filename
-print('Input filename:', in_filename
-      , '\nInput path:', indir
-      , '\n=============================================================')
-#=======
-# output 
-#=======
-outdir =   datadir + '/' + drug + '/' + 'output'
-out_filename = gene.lower() + '_rd.csv'
-outfile =  outdir + '/' + out_filename
-print('Output filename:', out_filename
-      , '\nOutput path:', outdir
-      , '\n=============================================================')
-      
-#%% end of variable assignment for input and output files
-#=======================================================================
-#%%  Read input file
-rd_data  = pd.read_csv(infile, sep = '\t')  
-print('Reading input file:', infile
-      , '\nNo. of rows:', len(rd_data)
-      , '\nNo. of cols:', len(rd_data.columns))
-
-print('Column names:', rd_data.columns
-	, '\n===============================================================')
-#========================
-# creating position col
-#========================
-# Extracting residue number from index and assigning 
-# the values to a column [position]. Then convert the position col to numeric.
-rd_data['position'] = rd_data.index.str.extract('([0-9]+)').values
-
-# converting position to numeric      
-rd_data['position'] = pd.to_numeric(rd_data['position'])
-rd_data['position'].dtype
-
-print('Extracted residue num from index and assigned as a column:'
-      , '\ncolumn name: position'
-      , '\ntotal no. of cols now:', len(rd_data.columns)
-      , '\n=============================================================')
-
-#========================
-# Renaming amino-acid 
-# and all-atom cols
-#========================
-print('Renaming columns:'
-      , '\ncolname==> # chain:residue: wt_3letter_caps'
-      , '\nYES... the column name *actually* contains a # ..!'
-      , '\ncolname==> all-atom: rd_values'
-      , '\n=============================================================')
-
-rd_data.rename(columns = {'# chain:residue':'wt_3letter_caps', 'all-atom':'rd_values'}, inplace = True)
-print('Column names:', rd_data.columns)
-
-#========================
-# extracting df with the
-# desired columns
-#========================
-print('Extracting relevant columns for writing df as csv')
-
-rd_df = rd_data[['position','rd_values','wt_3letter_caps']]
-
-if len(rd_df) == len(rd_data):
-    print('PASS: extracted df has expected no. of rows'
-          ,'\nExtracted df dim:'
-          ,'\nNo. of rows:', len(rd_df)
-          ,'\nNo. of cols:', len(rd_df.columns))
-else:
-    print('FAIL: no. of rows mimatch'
-          , '\nExpected no. of rows:', len(rd_data)
-          , '\nGot no. of rows:', len(rd_df)
-          , '\n=========================================================')
-
-#%% write file
-print('Writing file:'
-      , '\nFilename:', out_filename
-      , '\nPath:',  outdir
-      , '\n=============================================================')
-
-rd_df.to_csv(outfile, header = True, index = False)
-
-print('Finished writing:', out_filename
-      , '\nNo. of rows:', len(rd_df)
-      , '\nNo. of cols:', len(rd_df.columns)
-      , '\n=============================================================')
-
-#%% end of script
-#=======================================================================
diff --git a/meta_data_analysis/run_mutate.sh b/meta_data_analysis/run_mutate.sh
deleted file mode 100755
index 1865a5e..0000000
--- a/meta_data_analysis/run_mutate.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-#https://www.biostars.org/p/336891/
-#python Mutate.py -v -o /path/to/output.fasta mutation_map_file.csv input.fasta
-
-# pnca_all_muts_msa_FIXME: This should be formatted like this from python script
-# change to a cmd script that takes this "prefix" as an input
-for i in $(cat pnca_all_muts_msa_FIXME.csv); do echo "3PL1,${i}"; done > pnca_copy.txt
-
-# make sure there is no new line at the end of the mutation file (snps.csv)
-#python3 Mutate.py -v -o /home/tanu/git/Data/pyrazinamide/input/output.fasta mut_map.csv 3pl1.fasta.txt
-python3 mutate.py -v -o /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt /home/tanu/git/Data/pyrazinamide/output/pnca_all_muts_msa.csv /home/tanu/git/Data/pyrazinamide/input/pnca_fasta.txt
-
-# remove fasta style header lines in the output i.e
-# lines beginning with '>' so the file is just the mutated seqs
-sed -i '/^>.*$/d' /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt
-printf 'No. of lines after cleaning: '
-cat /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt | wc -l
diff --git a/meta_data_analysis/run_pdb_dssp.sh b/meta_data_analysis/run_pdb_dssp.sh
deleted file mode 100755
index 1ad3b36..0000000
--- a/meta_data_analysis/run_pdb_dssp.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-#=======================================================================
-# Task: read a pdb file and generate a dssp output file
-# Input:
-# pdb file
-
-# Output:
-# pdb_code.dssp
-# needs dssp exe on linux
-# more efficient to run dssp exe locally
-
-# note: double quotes for variable interpolation
-#=======================================================================
-# #%%specify variables for input and output paths and filenames
-drug='pyrazinamide'
-gene='pncA'
-#convert to lowercase for consistency in filenames
-gene_l=$(printf $gene | awk '{print tolower($0)}') 
-gene_match="${gene}_p."
-#printf "${gene_match}\n"
-
-#==========
-# data dir
-#==========
-#indir = 'git/Data/pyrazinamide/input/original'
-#datadir="~git/Data"
-datadir="${HOME}/git/Data"
-
-#=======
-# input
-#=======
-indir=${datadir}'/'${drug}'/input'
-printf "Input dir: ${indir}\n"
-in_filename='3pl1.pdb'
-#infile=${basedir}${inpath}${in_filename}
-infile=${indir}'/'${in_filename}
-printf "Input file: ${infile}\n"
-
-#=======
-# output 
-#=======
-outdir=${datadir}'/'${drug}$'/output'
-printf "Output dir: ${outdir}"
-#out_filename='3pl1.dssp'
-out_filename="${gene_l}.dssp"
-outfile=${outdir}'/'${out_filename}
-printf "Output file: ${outfile}\n"
-
-#%%end of variable assignment for input and output files
-#================================================================
-# command line arg to run dssp and create output file
-dssp -i ${infile} -o ${outfile}
-printf "Finished writing: ${outfile}\n"
-#printf "Filename: ${out_filename}\nlocated in: ${outdir}\n"
diff --git a/plotting_test/logo_plot.R b/plotting_test/logo_plot.R
deleted file mode 100644
index be1d9d7..0000000
--- a/plotting_test/logo_plot.R
+++ /dev/null
@@ -1,221 +0,0 @@
-#=======================================================================
-# Task: To generate a logo plot  or bar plot but coloured 
-# aa properties.
-# step1: read mcsm file and OR file
-# step2: plot wild type positions
-# step3: plot mutants per position coloured by aa properties
-# step4: make the size of the letters/bars prop to OR if you can!
-
-# useful links
-# https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
-# https://omarwagih.github.io/ggseqlogo/
-# https://kkdey.github.io/Logolas-pages/workflow.html
-# A new sequence logo plot to highlight enrichment and depletion.
-#    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/
-
-#very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/
-#=======================================================================
-#%% specify curr dir
-getwd()
-setwd('~/git/LSHTM_analysis/plotting_test/')
-getwd()
-#=======================================================================
-#%% load packages
-
-# header file
-header_dir = '~/git/LSHTM_analysis/'
-source(paste0(header_dir, '/', 'my_header.R'))
-#=======================================================================
-#%% variable assignment: input and output paths & filenames
-drug = 'pyrazinamide'
-gene = 'pncA'
-gene_match = paste0(gene,'_p.')
-cat(gene_match)
-
-#===========
-# data dir
-#===========
-datadir = paste0('~/git/Data')
-
-#===========
-# input
-#===========
-# source R script 'combining_two_df.R'
-#indir = paste0(datadir, '/', drug, '/', 'output') # reading files
-indir = '../meta_data_analysis' # sourcing R script
-in_filename = 'combining_df_ps.R'
-infile = paste0(indir, '/', in_filename)
-cat(paste0('Input is a R script: ', '\'', infile, '\'')
-    , '\n========================================================')
-
-#===========
-# output
-#===========
-# 1) lineage dist of all muts
-outdir = paste0('~/git/Data', '/', drug, '/', 'output/plots') #same as indir
-#cat('Output dir: ', outdir, '\n')
-#file_type = '.svg'
-#out_filename1 = paste0(tolower(gene), '_lineage_dist_ps', file_type) 
-#outfile1 = paste0(outdir, '/', out_filename1)
-#cat(paste0('Output plot1 :', outfile1)
-#    , '\n========================================================')
-
-#%% end of variable assignment for input and output files
-#=======================================================================
-##%% read input file
-cat('Reading input file(sourcing R script):', in_filename)
-
-source(infile)
-
-#==========================
-# This will return:
-
-# df with NA for pyrazinamide:
-# merged_df2
-# merged_df3
-
-# df without NA for pyrazinamide:
-# merged_df2_comp
-# merged_df3_comp
-#===========================
-
-###########################
-# Data for plots
-# you need merged_df2 or merged_df2_comp
-# since this is one-many relationship 
-# i.e the same SNP can belong to multiple lineages
-# using the _comp dataset means
-# we lose some muts and at this level, we should use
-# as much info as available, hence use df with NA
-
-# This will the first plotting df
-# Then subset this to extract dr muts only (second plottig df)
-###########################
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%
-# uncomment as necessary
-# REASSIGNMENT
-#my_data = merged_df2
-#my_data =  merged_df2_comp
-#my_data = merged_df3
-my_data = merged_df3_comp
-#%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-# delete variables not required
-rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
-
-# quick checks
-colnames(my_data)
-str(my_data)
-
-c1 = unique(my_data$Position) 
-nrow(my_data)
-cat('No. of rows in my_data:', nrow(my_data)
-    , '\nDistinct positions corresponding to snps:', length(c1)
-    , '\n===========================================================')
-
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-# FIXME: Think and decide what you want to remove
-# mut_pos_occurence < 1 or sample_pos_occurrence <1
-
-# get freq count of positions so you can subset freq<1
-require(data.table)
-#setDT(my_data)[, mut_pos_occurrence := .N, by = .(Position)] #265, 14
-
-#extract freq_pos>1
-#my_data_snp = my_data[my_data$occurrence!=1,]
-
-#u = unique(my_data_snp$Position) #73
-#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# REASSIGNMENT to prevent changing code
-my_data_snp = my_data
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-#=======================================================================
-#%% logo plots from dataframe
-
-#############
-# PLOTS: ggseqlogo with custom height
-# https://omarwagih.github.io/ggseqlogo/
-#############
-#require(ggplot2)
-#require(tidyverse)
-library(ggseqlogo)
-
-foo = my_data_snp[, c("Position", "Mutant_type","ratioDUET", "OR"
-                    , "mut_prop_polarity", "mut_prop_water") ] 
-
-# log10OR
-# FIXME: at the source script (when calculating AFandOR)
-my_data_snp$log10or = log10(my_data_snp$OR)
-bar = my_data_snp[, c('Position', 'Mutant_type', 'OR', 'logor', 'log10or')]
-
-
-bar_or = my_data_snp[, c('Position', 'Mutant_type', 'OR')]
-wide_df_or <- bar_or %>% spread(Position, OR, fill = 0)
-wide_df_or = as.matrix(wide_df_or)
-rownames(wide_df_or) = wide_df_or[,1]
-wide_df_or = wide_df_or[,-1]
-
-# custom height (OR) logo plot: yayy works
-ggseqlogo(wide_df_or, method='custom', seq_type='aa') + ylab('my custom height') +
-  theme(legend.position = "bottom"
-        , axis.text.x = element_text(size = 11
-                                     , angle = 90
-                                     , hjust = 1
-                                     , vjust = 0.4)
-        , axis.text.y = element_text(size = 15
-                                     , angle = 0
-                                     , hjust = 1
-                                     , vjust = 0))+
-  labs(title = "AA logo plot"
-       , x = "Wild-type Position"
-       , y = "OR")
-#%% end of logo plot with OR as height
-#=======================================================================
-# extracting data with log10OR
-bar_logor = my_data_snp[, c('Position', 'Mutant_type', 'log10or')]
-wide_df_logor <- bar_logor %>% spread(Position, log10or, fill = 0)
-
-wide_df_logor = as.matrix(wide_df_logor)
-rownames(wide_df_logor) = wide_df_logor[,1]
-wide_df_logor = wide_df_logor[,-1]
-
-#  custom height (log10OR) logo plot: yayy works
-ggseqlogo(wide_df_logor, method='custom', seq_type='aa') + ylab('my custom height') +
-  theme(legend.position = "bottom"
-        , axis.text.x = element_text(size = 11
-                                     , angle = 90
-                                     , hjust = 1
-                                     , vjust = 0.4)
-        , axis.text.y = element_text(size = 15
-                                     , angle = 0
-                                     , hjust = 1
-                                     , vjust = 0))+
-  labs(title = "AA logo plot"
-       , x = "Wild-type Position"
-       , y = "Log10(OR)")
-
-#=======================================================================
-#%% logo plot from sequence
-
-#################
-# Plot: LOGOLAS (ED plots)
-# link: https://github.com/kkdey/Logolas
-# on all pncA samples: output of mutate.py
-################
-library(Logolas)
-
-seqs = read.csv('~/git//Data/pyrazinamide/output/pnca_msa.txt'
-                , header = FALSE
-                , stringsAsFactors = FALSE)$V1
-
-
-# my_data: useful!
-logomaker(seqs, type = "EDLogo", color_type = 'per_symbol'
-          , return_heights = TRUE)
-logomaker(seqs, type = "Logo", color_type = 'per_symbol')
-
-#%% end of script
-#=======================================================================
\ No newline at end of file