updated gitignore to tidyup
This commit is contained in:
parent 1cf1f4e70e
commit 2c013124ad
14 changed files with 1 addition and 2677 deletions
.gitignore (vendored): 1 change
@@ -6,6 +6,7 @@
 __pycache__
 */__pycache__
 mcsm_analysis_fixme
+meta_data_analysis
 del
 examples
 example
@@ -1,398 +0,0 @@
#########################################################
# TASK: To calculate Allele Frequency and
# Odds Ratio from master data
# and add the calculated params to meta_data extracted from
# data_extraction.py
#########################################################
getwd()
setwd('~/git/LSHTM_analysis/meta_data_analysis')
getwd()

#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene, '_p.')
cat(gene_match)

#===========
# input
#===========
# infile1: Raw data
#indir = 'git/Data/pyrazinamide/input/original'
indir = paste0('~/git/Data')
in_filename = 'original_tanushree_data_v2.csv'
infile = paste0(indir, '/', in_filename)
cat(paste0('Reading infile1: raw data ', infile))

# infile2: gene associated meta data file to extract valid snps and add calcs to.
# This is outfile3 from data_extraction.py
indir_metadata = paste0('~/git/Data', '/', drug, '/', 'output')
in_filename_metadata = 'pnca_metadata.csv'
infile_metadata = paste0(indir_metadata, '/', in_filename_metadata)
cat(paste0('Reading infile2: gene associated metadata: ', infile_metadata))

#===========
# output
#===========
# outdir = 'git/Data/pyrazinamide/output'
outdir = paste0('~/git/Data', '/', drug, '/', 'output')
out_filename = paste0(tolower(gene), '_', 'meta_data_with_AFandOR.csv')
outfile = paste0(outdir, '/', out_filename)
cat(paste0('Output file with full path: ', outfile))
#%% end of variable assignment for input and output files
#########################################################
# 1: Read master/raw data stored in Data/
#########################################################
raw_data_all = read.csv(infile, stringsAsFactors = F)

raw_data = raw_data_all[, c("id"
                            , "pyrazinamide"
                            , "dr_mutations_pyrazinamide"
                            , "other_mutations_pyrazinamide")]
rm(raw_data_all)

rm(indir, in_filename, infile)

#===========
# 1a: exclude na
#===========
raw_data = raw_data[!is.na(raw_data$pyrazinamide),]

total_samples = length(unique(raw_data$id))
cat(paste0('Total samples without NA in ', drug, ' is: ', total_samples))

# sanity check: should be true
is.numeric(total_samples)
#===========
# 1b: combine the two mutation columns
#===========
raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide
                                            , raw_data$other_mutations_pyrazinamide)
head(raw_data$all_mutations_pyrazinamide)

#===========
# 1c: create yet another column that contains all the mutations, but in lower case
#===========
raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide)

# sanity checks
#table(grepl("pnca_p",raw_data$all_muts_pnca))
cat(paste0('converting gene match: ', gene_match, ' to lowercase'))
gene_match = tolower(gene_match)

table(grepl(gene_match, raw_data$all_muts_pnca))

# sanity check: should be TRUE
#sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
sum(table(grepl(gene_match, raw_data$all_muts_pnca))) == total_samples
# set up variables: can be used for logistic regression as well
i = "pnca_p.ala134gly" # has a NA, should NOT exist
table(grepl(i, raw_data$all_muts_pnca))

i = "pnca_p.trp68gly"
table(grepl(i, raw_data$all_muts_pnca))

mut = grepl(i, raw_data$all_muts_pnca)
dst = raw_data$pyrazinamide
table(mut, dst)

#chisq.test(table(mut,dst))
#fisher.test(table(mut, dst))
#table(mut)
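# NB (sketch): grepl() treats its pattern as a regex, so the '.' in
# "pnca_p.trp68gly" matches any single character, not just a literal dot.
# If a strictly literal match is wanted, fixed = TRUE does that:
# table(grepl("pnca_p.trp68gly", raw_data$all_muts_pnca, fixed = TRUE))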
#########################################################
# 2: Read valid snps for which OR
# can be calculated (infile_comp_snps.csv)
#########################################################
cat(paste0('Reading metadata infile: ', infile_metadata))

pnca_metadata = read.csv(infile_metadata
                         # , file.choose()
                         , stringsAsFactors = F
                         , header = T)

# clear variables (indir, in_filename and infile were already removed above)
rm(indir_metadata, in_filename_metadata, infile_metadata)
# count na in pyrazinamide column
tot_pza_na = sum(is.na(pnca_metadata$pyrazinamide))
expected_rows = nrow(pnca_metadata) - tot_pza_na

# drop na from the pyrazinamide column
pnca_snps_or = pnca_metadata[!is.na(pnca_metadata$pyrazinamide),]

# sanity check
if(nrow(pnca_snps_or) == expected_rows){
  cat('PASS: no. of rows match with expected_rows')
} else{
  cat('FAIL: nrows mismatch.')
}

# extract unique snps to iterate over for AF and OR calcs
pnca_snps_unique = unique(pnca_snps_or$mutation)

cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(pnca_snps_unique)))
# Define OR function
x = as.numeric(mut)
y = dst
or = function(x,y){
  tab = as.matrix(table(x,y))
  a = tab[2,2]
  if (a==0){ a<-0.5}
  b = tab[2,1]
  if (b==0){ b<-0.5}
  c = tab[1,2]
  if (c==0){ c<-0.5}
  d = tab[1,1]
  if (d==0){ d<-0.5}
  (a/b)/(c/d)
}
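# How this works (sketch, assuming dst is coded 0/1 with 1 = resistant):
# the 2x2 table of x (mut) by y (dst) gives
#   a = tab[2,2]: mut & dst=1,  b = tab[2,1]: mut & dst=0
#   c = tab[1,2]: wt  & dst=1,  d = tab[1,1]: wt  & dst=0
# so the function returns the cross-product odds ratio (a/b)/(c/d) = ad/bc.
# Zero cells are replaced with 0.5 (a variant of the Haldane-Anscombe
# correction, which normally adds 0.5 to every cell) to keep the OR finite.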
dst = raw_data$pyrazinamide
ors = sapply(pnca_snps_unique, function(m){
  mut = grepl(m, raw_data$all_muts_pnca)
  or(mut, dst)
})

ors

pvals = sapply(pnca_snps_unique, function(m){
  mut = grepl(m, raw_data$all_muts_pnca)
  fisher.test(mut, dst)$p.value
})

pvals

afs = sapply(pnca_snps_unique, function(m){
  mut = grepl(m, raw_data$all_muts_pnca)
  mean(mut)
})

afs
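# NB (sketch): mean() of a logical vector is the proportion of TRUEs, so afs
# is the fraction of samples carrying each mutation, i.e. the allele frequency:
# mean(c(TRUE, FALSE, FALSE, TRUE))  # 0.5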
# check ..hmmm
afs['pnca_p.trp68gly']
afs['pnca_p.gln10pro']
afs['pnca_p.leu4ser']

plot(density(log(ors)))
plot(-log10(pvals))
hist(log(ors)
     , breaks = 100
)

# sanity check
if (all(names(ors) == names(pvals)) & all(names(ors) == names(afs)) & length(afs) == length(pnca_snps_unique)){
  cat('PASS: names of ors, pvals and afs match: proceed with combining into a single df')
} else{
  cat('FAIL: names of ors, pvals and afs mismatch')
}
# combine ors, pvals and afs
cat('Combining calculated params into a df: ors, pvals and afs')

comb_AF_and_OR = data.frame(ors, pvals, afs)
cat('No. of rows in comb_AF_and_OR: ', nrow(comb_AF_and_OR)
    , '\nNo. of cols in comb_AF_and_OR: ', ncol(comb_AF_and_OR))

cat('Rownames == mutation: ', head(rownames(comb_AF_and_OR)))

# add rownames of comb_AF_and_OR as an extra column 'mutation' to allow merging based on this column
comb_AF_and_OR$mutation = rownames(comb_AF_and_OR)

# sanity check
if (all(rownames(comb_AF_and_OR) == comb_AF_and_OR$mutation)){
  cat('PASS: rownames and mutation col values match')
}else{
  cat('FAIL: rownames and mutation col values mismatch')
}
#########################################################
# 3: Merge meta data file + calculated num params
#########################################################
df1 = pnca_metadata
df2 = comb_AF_and_OR

cat('checking common col of the two dfs before merging:'
    , '\ndf1:', head(df1$mutation)
    , '\ndf2:', head(df2$mutation))

cat(paste0('merging two dfs: '
           , '\ndf1 (big df i.e. meta data) nrows: ', nrow(df1)
           , '\ndf2 (small df i.e. af, or, pval) nrows: ', nrow(df2)
           , '\nexpected rows in merged df: ', nrow(df1)
           , '\nexpected cols in merged_df: ', (ncol(df1) + ncol(df2) - 1)))

merged_df = merge(df1 # big file
                  , df2 # small (afor file)
                  , by = "mutation"
                  , all.x = T) # because you want all the entries of the meta data
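# (sketch) all.x = T is a left join: every metadata row survives, and rows
# whose mutation has no AF/OR entry get NA in the df2 columns, e.g.
# merge(data.frame(mutation = c('m1','m2')),
#       data.frame(mutation = 'm1', ors = 2.1),
#       by = 'mutation', all.x = TRUE)   # the m2 row has ors = NA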
# sanity check
if(ncol(merged_df) == (ncol(df1) + ncol(df2) - 1)){
  cat(paste0('PASS: no. of cols is as expected: ', ncol(merged_df)))
} else{
  cat('FAIL: no. of cols mismatch')
}

# quick check
i = "pnca_p.ala134gly" # has all NAs in pyrazinamide, should be NA in ors, etc.
merged_df[merged_df$mutation == i,]

# count na in each column
na_count = sapply(merged_df, function(y) sum(length(which(is.na(y))))); na_count

# check last three cols: should be NA
# NB: identical() compares only its first two arguments, so the check is chained
if (identical(na_count[[length(na_count)]], na_count[[length(na_count)-1]]) &
    identical(na_count[[length(na_count)-1]], na_count[[length(na_count)-2]])){
  cat('PASS: No. of NAs for OR, AF and Pvals are equal as expected',
      '\nNo. of NA: ', na_count[[length(na_count)]])
} else {
  cat('FAIL: No. of NAs for OR, AF and Pvals mismatch')
}
# reassign custom colnames
cat('Assigning custom colnames for the calculated params...')
colnames(merged_df)[colnames(merged_df) == "ors"] <- "OR"
colnames(merged_df)[colnames(merged_df) == "pvals"] <- "pvalue"
colnames(merged_df)[colnames(merged_df) == "afs"] <- "AF"

colnames(merged_df)

# add 3 more cols: log OR, neglog pvalue and AF_percent cols
merged_df$logor = log(merged_df$OR)
is.numeric(merged_df$logor)

merged_df$neglog10pvalue = -log10(merged_df$pvalue)
is.numeric(merged_df$neglog10pvalue)

merged_df$AF_percent = merged_df$AF*100
is.numeric(merged_df$AF_percent)
# check AFs
#i = 'pnca_p.trp68gly'
i = 'pnca_p.gln10pro'
#i = 'pnca_p.leu4ser'
merged_df[merged_df$mutation == i,]

# FIXME: hardcoding (beware!), NOT FATAL though!
ncol_added = 3

cat(paste0('Added ', ncol_added, ' more cols to merged_df:'
           , '\ncols added: logor, neglog10pvalue and AF_percent:'
           , '\nno. of cols in merged_df now: ', ncol(merged_df)))
#%% write file out: pnca_meta_data_with_AFandOR
#*********************************************
cat(paste0('writing output file: '
           , '\nFilename: ', out_filename
           , '\nPath: ', outdir))

write.csv(merged_df, outfile
          , row.names = F)

cat(paste0('Finished writing: '
           , out_filename
           , '\nNo. of rows: ', nrow(merged_df)
           , '\nNo. of cols: ', ncol(merged_df)))
#************************************************
cat('======================================================================')
rm(out_filename)
cat('End of script: calculated AF, OR, pvalues and saved file')
# End of script
#%%
# sanity check: count NA in these cols.
# However these need to be numeric, else the NA counts
# will be misleading (i.e. return 0)
# NB: this block expects meta_with_afor (read in by the combining script)
#is.numeric(meta_with_afor$OR)
na_var = c('AF', 'OR', 'pvalue', 'logor', 'neglog10pvalue')

# loop through these vars and check if these are numeric;
# if not, then convert to numeric
check_all = NULL

for (i in na_var){
  # cat(i)
  check0 = is.numeric(meta_with_afor[,i])
  if (check0) {
    check_all = c(check0, check_all)
    cat('These are all numeric cols')
  } else{
    cat('First converting to numeric')
    # assign the converted col back to the df, then record the check
    meta_with_afor[,i] = as.numeric(meta_with_afor[,i])
    check_all = c(is.numeric(meta_with_afor[,i]), check_all)
  }
}
# count na now that the respective cols are numeric
na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
str(na_count)

# extract how many NAs:
# should be all TRUE
# should be a single number since
# all the cols should have 'equal' and 'same' no. of NAs
# compare if the no. of 'NA' are the same for all these cols
na_len = NULL
for (i in na_var){
  temp = na_count[[i]]
  na_len = c(na_len, temp)
}

cat('Checking how many NAs and if these are identical for the selected cols:')
require(compare) # for compare()
my_nrows = NULL
for ( i in 1:(length(na_len)-1) ){
  # cat(compare(na_len[i], na_len[i+1]))
  c = compare(na_len[i], na_len[i+1])
  if ( c$result ) {
    cat('PASS: No. of NAs in selected cols are identical')
    my_nrows = na_len[i]
  } else {
    cat('FAIL: No. of NAs in selected cols mismatch')
  }
}

cat('No. of NAs in each of the selected cols: ', my_nrows)
# yet more sanity checks:
cat('Check whether the ', my_nrows, ' indices are indeed the same')

#which(is.na(meta_with_afor$OR))

# initialise an empty df with nrows as extracted above
na_count_df = data.frame(matrix(vector(mode = 'numeric'
                                       # , length = length(na_var)
                                       )
                                , nrow = my_nrows
                                # , ncol = length(na_var)
                                ))

# populate the df with the indices of the cols that are NA
for (i in na_var){
  cat(i)
  na_i = which(is.na(meta_with_afor[i]))
  na_count_df = cbind(na_count_df, na_i)
  colnames(na_count_df)[which(na_var == i)] <- i
}

# Now compare these indices to ensure these are the same
check2 = NULL
for ( i in 1:( length(na_count_df)-1 ) ) {
  # cat(na_count_df[i] == na_count_df[i+1])
  check_all = identical(na_count_df[[i]], na_count_df[[i+1]])
  check2 = c(check_all, check2)
  if ( all(check2) ) {
    cat('PASS: The indices for AF, OR, etc are all the same\n')
  } else {
    cat('FAIL: Please check indices which are NA')
  }
}
@@ -1,248 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug  6 12:56:03 2019

@author: tanu
'''
# FIXME: change filename 4 (mcsm normalised data)
# to be consistent like (pnca_complex_mcsm_norm.csv): changed manually, but ensure this is done in the mcsm pipeline
#=============================================================================
# Task: combine 4 dfs with aa position as linking column
# This is done in 2 steps:
# merge 1: of 3 dfs (filenames in lowercase)
# <gene.lower()>_dssp.csv
# <gene.lower()>_kd.csv
# <gene.lower()>_rd.csv

# merge 2: of 2 dfs
# pnca_complex_mcsm_norm.csv (!fix name in mcsm script)
# output df from merge 1

# Input: 4 dfs
# <gene.lower()>_dssp.csv
# <gene.lower()>_kd.csv
# <gene.lower()>_rd.csv
# pnca_complex_mcsm_norm.csv (!fix name in mcsm script)

# Output: .csv of all 4 dfs combined

# useful link
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=============================================================================
#%% load packages
import sys, os
import pandas as pd
#import numpy as np
import argparse
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'

drug = args.drug
gene = args.gene
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#=======
# input
#=======
indir = datadir + '/' + drug + '/' + 'output'
in_filename1 = 'pnca_dssp.csv'
in_filename2 = 'pnca_kd.csv'
in_filename3 = 'pnca_rd.csv'
#in_filename4 = 'mcsm_complex1_normalised.csv' # Fix name in mcsm script
in_filename4 = 'pnca_complex_mcsm_norm.csv' # manually changed temporarily

infile1 = indir + '/' + in_filename1
infile2 = indir + '/' + in_filename2
infile3 = indir + '/' + in_filename3
infile4 = indir + '/' + in_filename4

print('\nInput path:', indir
      , '\nInput filename1:', in_filename1
      , '\nInput filename2:', in_filename2
      , '\nInput filename3:', in_filename3
      , '\nInput filename4:', in_filename4
      , '\n===================================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_mcsm_struct_params.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n===================================================================')

#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input files
dssp_df = pd.read_csv(infile1, sep = ',')
kd_df = pd.read_csv(infile2, sep = ',')
rd_df = pd.read_csv(infile3, sep = ',')
mcsm_df = pd.read_csv(infile4, sep = ',')

print('Reading input files:'
      , '\ndssp file:', infile1
      , '\nNo. of rows:', len(dssp_df)
      , '\nNo. of cols:', len(dssp_df.columns)
      , '\nColumn names:', dssp_df.columns
      , '\n==================================================================='
      , '\nkd file:', infile2
      , '\nNo. of rows:', len(kd_df)
      , '\nNo. of cols:', len(kd_df.columns)
      , '\nColumn names:', kd_df.columns
      , '\n==================================================================='
      , '\nrd file:', infile3
      , '\nNo. of rows:', len(rd_df)
      , '\nNo. of cols:', len(rd_df.columns)
      , '\nColumn names:', rd_df.columns
      , '\n==================================================================='
      , '\nmcsm file:', infile4
      , '\nNo. of rows:', len(mcsm_df)
      , '\nNo. of cols:', len(mcsm_df.columns)
      , '\nColumn names:', mcsm_df.columns
      , '\n===================================================================')
#%% Begin combining dfs
#===================
# concatenating df1 (3 dfs): dssp_df + kd_df + rd_df
#===================
print('starting first merge...\n')

# checking no. of rows
print('Checking if no. of rows of the 3 dfs are equal:\n'
      , len(dssp_df) == len(kd_df) == len(rd_df)
      , '\nReason: fasta files and pdb files vary since not all pos are part of the structure'
      , '\n===================================================================')

# variables for sanity checks
expected_rows_df1 = max(len(dssp_df), len(kd_df), len(rd_df))
# beware of hardcoding! used for sanity check
ndfs = 3
ncol_merge = 1
offset = ndfs - ncol_merge
expected_cols_df1 = len(dssp_df.columns) + len(kd_df.columns) + len(rd_df.columns) - offset
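# Worked example (sketch): with one shared key col ('position') kept once in
# the result, the expected col count is
#   expected_cols = c_dssp + c_kd + c_rd - (ndfs - ncol_merge)
# e.g. for, say, 4 + 3 + 3 input cols -> 10 - 2 = 8 cols after the outer join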
print('Merge 1:'
      , '\ncombining 3 dfs by common col: position'
      , '\nExpected nrows in combined_df:', expected_rows_df1
      , '\nExpected ncols in combined_df:', expected_cols_df1
      , '\nResetting the common col as the index'
      , '\n===================================================================')

#dssp_df.set_index('position', inplace = True)
#kd_df.set_index('position', inplace = True)
#rd_df.set_index('position', inplace = True)

#combined_df = pd.concat([dssp_df, kd_df, rd_df], axis = 1, sort = False).reset_index()
#combined_df.rename(columns = {'index':'position'})

combined_df1 = pd.concat(
    (my_index.set_index('position') for my_index in [dssp_df, kd_df, rd_df])
    , axis = 1, join = 'outer').reset_index()
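# (sketch) axis = 1 with join = 'outer' aligns the three frames on their
# 'position' index and keeps every position seen in any frame, filling the
# missing entries with NaN; reset_index() then turns the index back into a
# plain 'position' column, ready for the second merge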
# sanity check
print('Checking dimensions of concatenated df1...')
if len(combined_df1) == expected_rows_df1 and len(combined_df1.columns) == expected_cols_df1:
    print('PASS: combined df has expected dimensions'
          , '\nNo. of rows in combined df:', len(combined_df1)
          , '\nNo. of cols in combined df:', len(combined_df1.columns)
          , '\n===============================================================')
else:
    print('FAIL: combined df does not have expected dimensions'
          , '\nNo. of rows in combined df:', len(combined_df1)
          , '\nNo. of cols in combined df:', len(combined_df1.columns)
          , '\n===============================================================')

#===================
# merging df2 (2 dfs): mcsm_df + combined_df1
# sort sorts the cols
#===================
print('starting second merge...\n')
# rename col 'Position' in mcsm_df to lowercase 'position'
# as it matches the combined_df1 colname to perform merge
#mcsm_df.columns
#mcsm_df.rename(columns = {'Position':'position'}) # not working! (rename returns a copy unless inplace = True)
# copy 'Position' column with the correct colname
print('Firstly, copying \'Position\' col and renaming \'position\' to allow merging'
      , '\nNo. of cols before copying: ', len(mcsm_df.columns))

mcsm_df['position'] = mcsm_df['Position']
print('No. of cols after copying: ', len(mcsm_df.columns))

# sanity check
if mcsm_df['position'].equals(mcsm_df['Position']):
    print('PASS: Copying worked correctly'
          , '\ncopied col matches original column'
          , '\n===============================================================')
else:
    print('FAIL: copied col does not match original column'
          , '\n================================================================')
# variables for sanity checks
expected_rows_df2 = len(mcsm_df)
# beware of hardcoding! used for sanity check
ndfs = 2
ncol_merge = 1
offset = ndfs - ncol_merge
expected_cols_df2 = len(mcsm_df.columns) + len(combined_df1.columns) - offset

print('Merge 2:'
      , '\ncombining 2 dfs by common col: position'
      , '\nExpected nrows in combined_df:', expected_rows_df2
      , '\nExpected ncols in combined_df:', expected_cols_df2
      , '\n===================================================================')

combined_df2 = mcsm_df.merge(combined_df1, on = 'position')
# sanity check
print('Checking dimensions of concatenated df2...')
if len(combined_df2) == expected_rows_df2 and len(combined_df2.columns) == expected_cols_df2:
    print('PASS: combined df2 has expected dimensions'
          , '\nNo. of rows in combined df:', len(combined_df2)
          , '\nNo. of cols in combined df:', len(combined_df2.columns)
          , '\n===============================================================')
else:
    print('FAIL: combined df2 does not have expected dimensions'
          , '\nNo. of rows in combined df:', len(combined_df2)
          , '\nNo. of cols in combined df:', len(combined_df2.columns)
          , '\n===============================================================')

#%% write file
print('Writing file:'
      , '\nFilename:', out_filename
      , '\nPath:', outdir
      , '\n===================================================================')

combined_df2.to_csv(outfile, header = True, index = False)

print('Finished writing:', out_filename
      , '\nNo. of rows:', len(combined_df2)
      , '\nNo. of cols:', len(combined_df2.columns)
      , '\n===================================================================')

#%% end of script
#==============================================================================
@@ -1,397 +0,0 @@
#=======================================================================
# TASK: To combine mcsm and meta data with af and or
# by filtering for distance to ligand (<10Ang).
# This script doesn't output anything.
# This script is sourced from other .R scripts for plotting ligand plots

# Input csv files:
# 1) mcsm normalised and struct params
# 2) gene associated meta_data_with_AFandOR
#=======================================================================
#%% specify curr dir
getwd()
setwd('~/git/LSHTM_analysis/meta_data_analysis/')
getwd()
#=======================================================================
#%% load packages
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)

# header file
header_dir = '~/git/LSHTM_analysis/'
source(paste0(header_dir, '/', 'my_header.R'))
#=======================================================================
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene, '_p.')
cat(gene_match)
#===========
# data dir
#===========
datadir = '~/git/Data'

#===========
# input
#===========
# infile1: mCSM data
#indir = '~/git/Data/pyrazinamide/input/processed/'
indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline}
#in_filename = 'mcsm_complex1_normalised.csv'
in_filename = 'pnca_mcsm_struct_params.csv'
infile = paste0(indir, '/', in_filename)
cat(paste0('Reading infile1: mCSM output file ', infile))

# infile2: gene associated meta data combined with AF and OR
#indir: same as above
in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv')
infile_comb = paste0(indir, '/', in_filename_comb)
cat(paste0('Reading infile2: gene associated combined metadata: ', infile_comb))

#===========
# output
#===========
# Uncomment if and when required to output
outdir = paste0(datadir, '/', drug, '/', 'output') # same as indir
cat('Output dir: ', outdir, '\n')
#out_filename = paste0(tolower(gene), 'XXX')
#outfile = paste0(outdir, '/', out_filename)
#cat(paste0('Output file with full path:', outfile))

#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input files

#####################################
# input file 1: mcsm normalised data
# output of step 4 mcsm_pipeline
#####################################

cat('Reading mcsm_data:'
    , '\nindir: ', indir
    , '\ninfile: ', in_filename)

mcsm_data = read.csv(infile
                     # , row.names = 1
                     , stringsAsFactors = F
                     , header = T)

cat('Read mcsm_data file:'
    , '\nNo. of rows: ', nrow(mcsm_data)
    , '\nNo. of cols: ', ncol(mcsm_data))

# clear variables
rm(in_filename, infile)

str(mcsm_data)

table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome))

# spelling correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'

# checks: should be the same as above
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome))
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)

# spelling correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome))

mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'

# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome))
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)

# muts with opposing effects on protomer and ligand stability
# excluded from here as it is redundant;
# check 'combining_two_df.R' to refer if required.
#=======================================================================
#%% !!! Filter data only for mcsm lig !!!

###########################
# Filter/subset data
# Lig plots < 10Ang
# Filter the lig plots for Dis_to_lig < 10Ang
###########################

# check range of distances
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)

# count
table(mcsm_data$Dis_lig_Ang < 10)

# subset data to have only values less than 10 Ang
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)

# sanity checks
max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)

# count no. of unique positions
length(unique(mcsm_data2$Position))

# count no. of unique mutations
length(unique(mcsm_data2$Mutationinformation))

# count destabilising and stabilising
table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang}

#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################

if (max(mcsm_data2$Dis_lig_Ang) < 10){
  print("Sanity check passed: lig data is <10Ang")
}else{
  print("Error: data should be filtered to be within 10Ang")
}
#!!!!!!!!!!!!!!!!!!!!!
# REASSIGNMENT: so as not to alter the script
mcsm_data = mcsm_data2
#!!!!!!!!!!!!!!!!!!!!!
#=======================================================================
# clear variables
rm(mcsm_data2)

# count na in each column
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count

# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)

orig_col = ncol(mcsm_data)
# get freq count of positions and add to the df
# (mcsm data uses the capitalised 'Position' col, as in combining_two_df.R)
setDT(mcsm_data)[, mut_pos_occurrence := .N, by = .(Position)]

cat('Added col: position frequency to see which posn has how many muts'
    , '\nNo. of cols now: ', ncol(mcsm_data)
    , '\nNo. of cols before: ', orig_col)

mut_pos_occurrence = data.frame(mcsm_data$Mutationinformation, mcsm_data$Position, mcsm_data$mut_pos_occurrence)
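# (sketch, with data.table loaded via my_header.R) setDT() converts the df to
# a data.table by reference and ':=' adds a grouped count col, .N being the
# no. of rows per group, e.g.
# dt = data.table(Position = c(4, 4, 68)); dt[, n := .N, by = .(Position)]
# gives n = 2, 2, 1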
######################################
# input file 2: meta data with AFandOR
######################################
cat('Reading combined meta data and AFandOR file:'
    , '\nindir: ', indir
    , '\ninfile_comb: ', in_filename_comb)

meta_with_afor <- read.csv(infile_comb
                           , stringsAsFactors = F
                           , header = T)

cat('Read meta_with_afor file:'
    , '\nNo. of rows: ', nrow(meta_with_afor)
    , '\nNo. of cols: ', ncol(meta_with_afor))

# clear variables
rm(in_filename_comb, infile_comb)

str(meta_with_afor)

# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)

orig_col2 = ncol(meta_with_afor)

# get freq count of positions and add to the df
# (metadata uses the lowercase 'position' col, as in combining_two_df.R)
setDT(meta_with_afor)[, sample_pos_occurrence := .N, by = .(position)]

cat('Added col: position frequency of samples to check'
    , 'how many samples correspond to a particular posn associated with muts'
    , '\nNo. of cols now: ', ncol(meta_with_afor)
    , '\nNo. of cols before: ', orig_col2)

sample_pos_occurrence = data.frame(meta_with_afor$id, meta_with_afor$position, meta_with_afor$sample_pos_occurrence)
#=======================================================================
cat('Begin merging dfs with NAs'
    , '\n===============================================================')

###########################
# merging two dfs: with NA
###########################
# link col name = 'Mutationinformation'
cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)'
    , '\nlinking col: Mutationinformation'
    , '\nfilename: merged_df2')

head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)

#########
# a) merged_df2: meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
                   , y = mcsm_data
                   , by = 'Mutationinformation'
                   , all.y = T)

cat('Dim of merged_df2: '
    , '\nNo. of rows: ', nrow(merged_df2)
    , '\nNo. of cols: ', ncol(merged_df2))
head(merged_df2$Position)

if(nrow(meta_with_afor) == nrow(merged_df2)){
  cat('nrow(merged_df2) == nrow(gene associated metadata)'
      , '\nExpected no. of rows: ', nrow(meta_with_afor)
      , '\nGot no. of rows: ', nrow(merged_df2))
} else{
  cat('nrow(merged_df2) != nrow(gene associated metadata)'
      , '\nExpected no. of rows after merge: ', nrow(meta_with_afor)
      , '\nGot no. of rows: ', nrow(merged_df2)
      , '\nFinding discrepancy')
  merged_muts_u = unique(merged_df2$Mutationinformation)
  meta_muts_u = unique(meta_with_afor$Mutationinformation)
  # find the index where it differs
  unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
}
# sort by Position
head(merged_df2$Position)
merged_df2 = merged_df2[order(merged_df2$Position),]
head(merged_df2$Position)

merged_df2v2 = merge(x = meta_with_afor
                     , y = mcsm_data
                     , by = 'Mutationinformation'
                     , all.x = T)
#!=!=!=!=!=!=!=!
# COMMENT: merged_df2 (above) used all.y since position 186 is not part of the
# structure, hence doesn't have a mcsm value,
# but 186 is associated with mutation
#!=!=!=!=!=!=!=!

# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position %in% merged_df2v2$Position)

rm(merged_df2v2)
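# (sketch) all.y = T keeps every mcsm row, so a mutation absent from the
# metadata would get NA in the meta cols; all.x = T instead keeps metadata
# rows (e.g. position 186) that have no mcsm value, e.g.
# merge(data.frame(Mutationinformation = c('a','b'), AF = c(0.1, 0.2)),
#       data.frame(Mutationinformation = 'a', DUET = -1.2),
#       by = 'Mutationinformation', all.y = TRUE)   # only row 'a' survives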
#########
# b) merged_df3: remove duplicate mutation information
#########
cat('Merging dfs with NAs: small df (removing duplicate muts)'
    , '\nCannot trust lineage info from this'
    , '\nlinking col: Mutationinformation'
    , '\nfilename: merged_df3')

#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages,
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position); tail(merged_df3$Position) # should be sorted

# sanity checks
# nrows of merged_df3 should be the same as the nrows of mcsm_data
if(nrow(mcsm_data) == nrow(merged_df3)){
  cat('PASS: No. of rows match with mcsm_data'
      , '\nExpected no. of rows: ', nrow(mcsm_data)
      , '\nGot no. of rows: ', nrow(merged_df3))
} else {
  cat('FAIL: No. of rows mismatch'
      , '\nNo. of rows mcsm_data: ', nrow(mcsm_data)
      , '\nNo. of rows merged_df3: ', nrow(merged_df3))
}
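# (sketch) !duplicated() keeps only the first occurrence of each mutation,
# which is why the 1-row-per-mutation df loses the per-sample fields:
# duplicated(c('m1','m1','m2'))  # FALSE TRUE FALSE -> rows 1 and 3 kept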
# count NAs in AF, OR cols in merged_df3
# (mcsm data itself has no AF, OR cols, so NAs cannot be counted there)
if (identical(sum(is.na(merged_df3$OR))
              , sum(is.na(merged_df3$pvalue)))
    & identical(sum(is.na(merged_df3$pvalue))
                , sum(is.na(merged_df3$AF)))){
  cat('PASS: NA count match for OR, pvalue and AF\n')
  na_count_df3 = sum(is.na(merged_df3$AF))
  cat('No. of NAs: ', sum(is.na(merged_df3$OR)))
} else{
  cat('FAIL: NA count mismatch'
      , '\nNA in OR: ', sum(is.na(merged_df3$OR))
      , '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue))
      , '\nNA in AF: ', sum(is.na(merged_df3$AF)))
}
#=======================================================================
#%% merging without NAs

cat('Begin merging dfs without NAs'
    , '\n===============================================================')

cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)'
    , '\nlinking col: Mutationinformation'
    , '\nfilename: merged_df2_comp')

#########
# c) merged_df2_comp: merging two dfs without NA
#########
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),]

# sanity check
# no. of rows dropped should equal the no. of NAs in the AF col of merged_df2
na_count_df2 = sum(is.na(merged_df2$AF))
cat('Checking nrows in merged_df2_comp')
if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
  cat('PASS: No. of rows match'
      , '\nDim of merged_df2_comp: '
      , '\nExpected no. of rows: ', nrow(merged_df2) - na_count_df2
      , '\nNo. of rows: ', nrow(merged_df2_comp)
      , '\nNo. of cols: ', ncol(merged_df2_comp))
}else{
  cat('FAIL: No. of rows mismatch'
      , '\nExpected no. of rows: ', nrow(merged_df2) - na_count_df2
      , '\nGot no. of rows: ', nrow(merged_df2_comp))
}
#########
# d) merged_df3_comp: remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]

cat('Dim of merged_df3_comp: '
    , '\nNo. of rows: ', nrow(merged_df3_comp)
    , '\nNo. of cols: ', ncol(merged_df3_comp))

# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_comp
all.equal(foo, merged_df3_comp)

summary(comparedf(foo, merged_df3_comp))

# sanity check
cat('Checking nrows in merged_df3_comp')
if(nrow(merged_df3_comp) == nrow(merged_df3)){
  cat('NO NAs detected in merged_df3 in AF|OR cols'
      , '\nNo. of rows are identical: ', nrow(merged_df3))
} else{
  if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) {
    cat('PASS: NAs detected in merged_df3 in AF|OR cols'
        , '\nNo. of NAs: ', na_count_df3
        , '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3
        , '\nGot no. of rows: ', nrow(merged_df3_comp))
  }
}

#=======================================================================
# write output files
# Not required as this is a subset of combining_two_df.R
#*************************
# clear variables
rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir)
rm(mut_pos_occurrence)
#%% end of script
#=======================================================================
@@ -1,461 +0,0 @@
#=======================================================================
# TASK: To combine mcsm and meta data with af and or files
# Input csv files:
# 1) mcsm normalised and struct params
# 2) gene associated meta_data_with_AFandOR

# Output:
# 1) muts with opposite effects on stability
# 2) large combined df including NAs for AF, OR, etc.
#    Dim: same no. of rows as gene associated meta_data_with_AFandOR
# 3) small combined df including NAs for AF, OR, etc.
#    Dim: same as mcsm data
# 4) large combined df excluding NAs
#    Dim: dim(#2) - no. of NAs (AF|OR) + 1
# 5) small combined df excluding NAs
#    Dim: dim(#3) - no. of unique NAs - 1
# This script is sourced from other .R scripts for plotting
#=======================================================================
#%% specify curr dir
getwd()
setwd('~/git/LSHTM_analysis/meta_data_analysis/')
getwd()
#=======================================================================
#%% load packages
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)

# header file
header_dir = '~/git/LSHTM_analysis/'
source(paste0(header_dir, '/', 'my_header.R'))
#=======================================================================
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene, '_p.')
cat(gene_match)
#===========
# data dir
#===========
datadir = '~/git/Data'

#===========
# input
#===========
# infile1: mCSM data
#indir = '~/git/Data/pyrazinamide/input/processed/'
indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline}
#in_filename = 'mcsm_complex1_normalised.csv'
in_filename = 'pnca_mcsm_struct_params.csv'
infile = paste0(indir, '/', in_filename)
cat(paste0('Reading infile1: mCSM output file ', infile, '\n'))

# infile2: gene associated meta data combined with AF and OR
#indir: same as above
in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv')
infile_comb = paste0(indir, '/', in_filename_comb)
cat(paste0('Reading infile2: gene associated combined metadata: ', infile_comb, '\n'))

#===========
# output
#===========
# Uncomment if and when required to output
outdir = paste0(datadir, '/', drug, '/', 'output') # same as indir
cat('Output dir: ', outdir, '\n')
#out_filename = paste0(tolower(gene), 'XXX')
#outfile = paste0(outdir, '/', out_filename)
#cat(paste0('Output file with full path:', outfile))

#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input files

#####################################
# input file 1: mcsm normalised data
# output of step 4 mcsm_pipeline
#####################################
cat('Reading mcsm_data:'
    , '\nindir: ', indir
    , '\ninfile: ', in_filename)

mcsm_data = read.csv(infile
                     # , row.names = 1
                     , stringsAsFactors = F
                     , header = T)

cat('Read mcsm_data file:'
    , '\nNo. of rows: ', nrow(mcsm_data)
    , '\nNo. of cols: ', ncol(mcsm_data))

# clear variables
rm(in_filename, infile)

str(mcsm_data)

table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome))

# spelling correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'

# checks: should be the same as above
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome))
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)

# spelling correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome))

mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'

# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome))
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
# muts with opposing effects on protomer and ligand stability
table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)
changes = mcsm_data[which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome),]

# sanity check: redundant, but uber cautious!
dl_i = which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)
ld_i = which(mcsm_data$Lig_outcome != mcsm_data$DUET_outcome)

cat('Identifying muts with opposite stability effects')
if(nrow(changes) == (table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)[[2]]) & identical(dl_i, ld_i)) {
  cat('PASS: muts with opposite effects on stability and affinity correctly identified'
      , '\nNo. of such muts: ', nrow(changes))
}else {
  cat('FAIL: unsuccessful in extracting muts with changed stability effects')
}
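# NB (sketch): table(...)[[2]] assumes both FALSE and TRUE occur; if no muts
# had opposing effects, the '[[2]]' index would not exist, so
# sum(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome) is a safer count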
#***************************
# write file: changed muts
out_filename = 'muts_opp_effects.csv'
outfile = paste0(outdir, '/', out_filename)
cat('Writing file for muts with opp effects:'
    , '\nFilename: ', outfile
    , '\nPath: ', outdir)

write.csv(changes, outfile)
#****************************
# clear variables
rm(out_filename, outfile)
rm(changes, dl_i, ld_i)
# count na in each column
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count

# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)

orig_col = ncol(mcsm_data)

# get freq count of positions and add to the df
setDT(mcsm_data)[, mut_pos_occurrence := .N, by = .(Position)]

cat('Added col: position frequency of muts to see which posn has how many muts'
    , '\nNo. of cols now: ', ncol(mcsm_data)
    , '\nNo. of cols before: ', orig_col)

mut_pos_occurrence = data.frame(mcsm_data$Mutationinformation
                                , mcsm_data$Position
                                , mcsm_data$mut_pos_occurrence)

colnames(mut_pos_occurrence) = c('Mutationinformation', 'position', 'mut_pos_occurrence')
#######################################
# input file 2: meta data with AFandOR
#######################################
cat('Reading combined meta data and AFandOR file:'
    , '\nindir: ', indir
    , '\ninfile_comb: ', in_filename_comb)

meta_with_afor <- read.csv(infile_comb
                           , stringsAsFactors = F
                           , header = T)

cat('Read meta_with_afor file:'
    , '\nNo. of rows: ', nrow(meta_with_afor)
    , '\nNo. of cols: ', ncol(meta_with_afor))

# counting NAs in AF, OR cols
if (identical(sum(is.na(meta_with_afor$OR))
              , sum(is.na(meta_with_afor$pvalue)))
    & identical(sum(is.na(meta_with_afor$pvalue))
                , sum(is.na(meta_with_afor$AF)))){
  cat('PASS: NA count match for OR, pvalue and AF\n')
  na_count = sum(is.na(meta_with_afor$AF))
  cat('No. of NAs: ', sum(is.na(meta_with_afor$OR)))
} else{
  cat('FAIL: NA count mismatch'
      , '\nNA in OR: ', sum(is.na(meta_with_afor$OR))
      , '\nNA in pvalue: ', sum(is.na(meta_with_afor$pvalue))
      , '\nNA in AF: ', sum(is.na(meta_with_afor$AF)))
}
# clear variables
rm(in_filename_comb, infile_comb)

str(meta_with_afor)

# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)

orig_col2 = ncol(meta_with_afor)

# get freq count of positions and add to the df
setDT(meta_with_afor)[, sample_pos_occurrence := .N, by = .(position)]

cat('Added col: position frequency of samples to check'
    , 'how many samples correspond to a particular posn associated with muts'
    , '\nNo. of cols now: ', ncol(meta_with_afor)
    , '\nNo. of cols before: ', orig_col2)

sample_pos_occurrence = data.frame(meta_with_afor$id
                                   , meta_with_afor$mutation
                                   , meta_with_afor$Mutationinformation
                                   , meta_with_afor$position
                                   , meta_with_afor$sample_pos_occurrence)
colnames(sample_pos_occurrence) = c('id', 'mutation', 'Mutationinformation', 'position', 'sample_pos_occurrence')
#=======================================================================
cat('Begin merging dfs with NAs'
    , '\n===============================================================')

###########################
# merging two dfs: with NA
###########################
# link col name = 'Mutationinformation'
head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)

cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)'
    , '\nlinking col: Mutationinformation'
    , '\nfilename: merged_df2')

#########
# a) merged_df2: meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
                   , y = mcsm_data
                   , by = 'Mutationinformation'
                   , all.y = T)

cat('Dim of merged_df2: '
    , '\nNo. of rows: ', nrow(merged_df2)
    , '\nNo. of cols: ', ncol(merged_df2))
head(merged_df2$Position)

# sanity check
cat('Checking nrows in merged_df2')
if(nrow(meta_with_afor) == nrow(merged_df2)){
  cat('nrow(merged_df2) == nrow(gene associated metadata)'
      , '\nExpected no. of rows: ', nrow(meta_with_afor)
      , '\nGot no. of rows: ', nrow(merged_df2))
} else{
  cat('nrow(merged_df2) != nrow(gene associated metadata)'
      , '\nExpected no. of rows after merge: ', nrow(meta_with_afor)
      , '\nGot no. of rows: ', nrow(merged_df2)
      , '\nFinding discrepancy')
  merged_muts_u = unique(merged_df2$Mutationinformation)
  meta_muts_u = unique(meta_with_afor$Mutationinformation)
  # find the index where it differs
  unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
}
# sort by Position
|
|
||||||
head(merged_df2$Position)
|
|
||||||
merged_df2 = merged_df2[order(merged_df2$Position),]
|
|
||||||
head(merged_df2$Position)
|
|
||||||
|
|
||||||
merged_df2v2 = merge(x = meta_with_afor
|
|
||||||
,y = mcsm_data
|
|
||||||
, by = 'Mutationinformation'
|
|
||||||
, all.x = T)
|
|
||||||
#!=!=!=!=!=!=!=!
|
|
||||||
# COMMENT: used all.y since position 186 is not part of the struc,
|
|
||||||
# hence doesn't have a mcsm value
|
|
||||||
# but 186 is associated with mutation
|
|
||||||
#!=!=!=!=!=!=!=!
|
|
||||||
|
|
||||||
# should be False
|
|
||||||
identical(merged_df2, merged_df2v2)
|
|
||||||
table(merged_df2$Position%in%merged_df2v2$Position)
|
|
||||||
|
|
||||||
rm(merged_df2v2)
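
# Toy sketch of all.x vs all.y (illustration only, not from the original):
# x = data.frame(k = c('a','b'), vx = 1:2)
# y = data.frame(k = c('b','c'), vy = 3:4)
# merge(x, y, by = 'k', all.y = T)  # rows 'b','c'; vx is NA for 'c'
# merge(x, y, by = 'k', all.x = T)  # rows 'a','b'; vy is NA for 'a'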

#########
# b) merged_df3: remove duplicate mutation information
#########
cat('Deduplicating merged_df2: small df (one row per mutation)'
    , '\nCannot trust lineage info from this'
    , '\nlinking col: Mutationinformation'
    , '\nfilename: merged_df3')

#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
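
# Toy sketch (illustration only): !duplicated() keeps the first row per key,
# e.g. with Mutationinformation = c('m1','m1','m2') rows 1 and 3 survive;
# the per-sample cols (id, lineage, country) of the dropped rows are lost,
# which is why lineage can't be trusted in merged_df3.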

# sanity check
cat('Checking nrows in merged_df3')
if(nrow(mcsm_data) == nrow(merged_df3)){
  cat('PASS: No. of rows match with mcsm_data'
      , '\nExpected no. of rows: ', nrow(mcsm_data)
      , '\nGot no. of rows: ', nrow(merged_df3))
} else {
  cat('FAIL: No. of rows mismatch'
      , '\nNo. of rows mcsm_data: ', nrow(mcsm_data)
      , '\nNo. of rows merged_df3: ', nrow(merged_df3))
}

# counting NAs in AF, OR cols in merged_df3
# (mcsm_data has no AF|OR cols, so the NA count can only happen post-merge)
if (identical(sum(is.na(merged_df3$OR))
              , sum(is.na(merged_df3$pvalue))
              , sum(is.na(merged_df3$AF)))){
  cat('PASS: NA count match for OR, pvalue and AF\n')
  na_count_df3 = sum(is.na(merged_df3$AF))
  cat('No. of NAs: ', sum(is.na(merged_df3$OR)))
} else{
  cat('FAIL: NA count mismatch'
      , '\nNA in OR: ', sum(is.na(merged_df3$OR))
      , '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue))
      , '\nNA in AF:', sum(is.na(merged_df3$AF)))
}
#=======================================================================
#%% merging without NAs

cat('Begin merging dfs without NAs'
    , '\n===============================================================')

cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)'
    , '\nlinking col: Mutationinformation'
    , '\nfilename: merged_df2_comp')

#########
# c) merged_df2_comp: same as merge 1 but excluding NA
#########
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),]

# sanity check
cat('Checking nrows in merged_df2_comp')
if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){
  cat('PASS: No. of rows match'
      , '\nDim of merged_df2_comp: '
      , '\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1
      , '\nNo. of rows: ', nrow(merged_df2_comp)
      , '\nNo. of cols: ', ncol(merged_df2_comp))
}else{
  cat('FAIL: No. of rows mismatch'
      , '\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1
      , '\nGot no. of rows: ', nrow(merged_df2_comp))
}

#########
# d) merged_df3_comp: remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]

cat('Dim of merged_df3_comp: '
    , '\nNo. of rows: ', nrow(merged_df3_comp)
    , '\nNo. of cols: ', ncol(merged_df3_comp))

# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_comp
all.equal(foo, merged_df3_comp)

require(arsenal) # comparedf() comes from the arsenal package
summary(comparedf(foo, merged_df3_comp))

# sanity check
cat('Checking nrows in merged_df3_comp')
if(nrow(merged_df3_comp) == nrow(merged_df3)){
  cat('NO NAs detected in merged_df3 in AF|OR cols'
      , '\nNo. of rows are identical: ', nrow(merged_df3))
} else{
  if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) {
    cat('PASS: NAs detected in merged_df3 in AF|OR cols'
        , '\nNo. of NAs: ', na_count_df3
        , '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3
        , '\nGot no. of rows: ', nrow(merged_df3_comp))
  }
}
#=======================================================================
#*********************
# writing 1 file in the style of a loop: merged_df3
# print(output dir)
#i = 'merged_df3'
#out_filename = paste0(i, '.csv')
#outfile = paste0(outdir, '/', out_filename)

#cat('Writing output file: '
#    ,'\nFilename: ', out_filename
#    ,'\nPath: ', outdir)

#template: write.csv(merged_df3, 'merged_df3.csv')
#write.csv(get(i), outfile, row.names = FALSE)
#cat('Finished writing: ', outfile
#    , '\nNo. of rows: ', nrow(get(i))
#    , '\nNo. of cols: ', ncol(get(i)))

#%% write_output files; all 4 files:
outvars = c('merged_df2'
            , 'merged_df3'
            , 'merged_df2_comp'
            , 'merged_df3_comp')

cat('Writing output files: '
    , '\nPath:', outdir)

for (i in outvars){
  # cat(i, '\n')
  out_filename = paste0(i, '.csv')
  # cat(out_filename, '\n')
  # cat('getting value of variable: ', get(i))
  outfile = paste0(outdir, '/', out_filename)
  # cat('Full output path: ', outfile, '\n')
  cat('Writing output file:'
      , '\nFilename: ', out_filename, '\n')
  write.csv(get(i), outfile, row.names = FALSE)
  cat('Finished writing: ', outfile
      , '\nNo. of rows: ', nrow(get(i))
      , '\nNo. of cols: ', ncol(get(i)), '\n')
}

# alternate way to replace with implicit loop
# FIXME
#sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, '/', outvars, '.csv')))
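
# A working sketch of that implicit loop (assumes outvars and outdir as above;
# not part of the original script):
# invisible(lapply(outvars, function(v)
#   write.csv(get(v), paste0(outdir, '/', v, '.csv'), row.names = FALSE)))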

#=======================================================================
#%% merging mut_pos_occurrence and sample_pos_occurrence
# FIXME
#cat('Merging dfs with positional frequency from mcsm and meta_data'
#    , '\nNcol in mut_pos_occurrence:', ncol(mut_pos_occurrence)
#    , '\nncol in sample_pos_occurrence:', ncol(sample_pos_occurrence)
#    ,'\nlinking col:', intersect(colnames(sample_pos_occurrence), colnames(mut_pos_occurrence))
#    ,'\nfilename: merged_df4')

#merged_df4 = merge(sample_pos_occurrence, mut_pos_occurrence
#                   , by = 'position'
#                   , all = T)

#out_filename4 = 'mut_and_sample_freq.csv'
#outfile4 = paste0(outdir, '/', out_filename4)

#*************************
# clear variables
rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir)
rm(mut_pos_occurrence, sample_pos_occurrence)
#rm(merged_df4)
#%% end of script
#=======================================================================
@ -1,170 +0,0 @@
#!/home/tanu/anaconda3/envs/ContactMap/bin/python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 18 10:10:12 2020

@author: tanu
"""
#=======================================================================
# Task: Read a DSSP file into a data frame and output to a csv file

# Input: '.dssp' i.e. gene-associated .dssp file (output from run_dssp.sh)

# Output: '.csv' file containing DSSP output as a df with ASA, RSA, etc.
# based on Tien et al. 2013 (theor.) values

# useful links:
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
# https://en.wikipedia.org/wiki/Relative_accessible_surface_area
#=======================================================================
#%% load packages
import sys, os
import re
import pandas as pd
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
import pprint as pp
import dms_tools2
import dms_tools2.dssp

#=======================================================================#
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'

drug = 'cycloserine'
gene = 'alr'
#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'

#=======
# input from outdir
#=======
indir = datadir + '/' + drug + '/' + 'input'
#1) pdb file
in_filename_pdb = gene.lower() + '_complex' + '.pdb'
infile_pdb = indir + '/' + in_filename_pdb
print('Input pdb filename:', in_filename_pdb
      , '\npath:', indir
      , '\n============================================================')

#2) dssp file
outdir = datadir + '/' + drug + '/' + 'output'
in_filename = gene.lower() + '.dssp'
infile = outdir + '/' + in_filename
print('Input dssp filename:', in_filename
      , '\npath:', outdir
      , '\n============================================================')

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_dssp.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\npath:', outdir
      , '\nOutfile: ', outfile
      , '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% specify pdb chain as a list. Handy when more than 1 pdb chain
my_chains = ['A']
# my_chains = ['A', 'B'] # for cycloserine

# generate my_chains from dssp
p = PDBParser()
structure = p.get_structure(in_filename_pdb, infile_pdb)
model = structure[0]
dssp = DSSP(model, infile_pdb)

#print(dssp.keys())
#print(dssp.keys()[0][0])
#print(len(dssp))
#print(dssp.keys()[0][0])
#print(dssp.keys()[len(dssp)-1][0])

dssp_chains = []
for num_aa in range(0, len(dssp)):
    # print(num_aa)
    # extract the chain id only and append to a list
    dssp_chains.append(dssp.keys()[num_aa][0])
chainsL = list(set(dssp_chains))

print(chainsL)
# sort the list for sanity (since sets are not ordered)
# this will be required for dssp_df
my_chains = sorted(chainsL)

print('dssp output for'
      , in_filename, 'contains:', len(my_chains)
      , 'chains:\n', my_chains)
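
# Equivalent sketch (illustration only, not from the original): the chain ids
# can also be collected with a set comprehension over the dssp keys:
# my_chains = sorted({k[0] for k in dssp.keys()})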
#%% ====================================================================
# Process dssp output and extract into df (single chain)

#dssp_df = dms_tools2.dssp.processDSSP(infile, chain = my_chains)
#dssp_df['chain_dssp'] = my_chains
#pp.pprint(dssp_df)
#=======================================================================
# in case the pdb has > 1 chain and you need to run it for all chains

# initialise an empty df
dssp_df = pd.DataFrame()
print('Total no. of chains: ', len(my_chains))
for chain_id in my_chains:
    print('Chain id:', chain_id)
    dssp_cur = pd.DataFrame()
    dssp_cur = dms_tools2.dssp.processDSSP(infile, chain = chain_id)
    #!!!Important!!!
    dssp_cur['chain_id'] = chain_id
    dssp_df = dssp_df.append(dssp_cur)
pp.pprint(dssp_df)

#=====================
# Renaming amino-acid
# and site columns
#=====================
# Rename column (amino_acid) as 'wild_type_dssp' and (site) as 'position'
# to be the same names as used in the file required for merging later.
dssp_df.columns
dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True)
dssp_df.columns

# sanity check
if len(dssp_df) == len(dssp):
    print('PASS: dssp_df has the expected length')
else:
    print('FAIL: length mismatch for dssp_df'
          , '\nexpected length:', len(dssp)
          , '\nGot length:', len(dssp_df)
          , 'Debug please!')

#%% Write output csv file
print('Writing file:', outfile
      , '\nFilename:', out_filename
      , '\nPath:', outdir
      , '\n=============================================================')

# write to csv
dssp_df.to_csv(outfile, header = True, index = False)

print('Finished writing:', out_filename
      , '\nNo. of rows:', len(dssp_df)
      , '\nNo. of cols:', len(dssp_df.columns)
      , '\n==============================================================')
#%% end of script
#=======================================================================
#%% run dssp to extract chain ids to later process the dssp output into a df
@ -1,36 +0,0 @@
#!/usr/bin/python3
# Initialise a blank 'Data' directory and drug subdirs etc.
# TODO:
# - Read base dir from config file
# - Create eg: '~/git/Data/{original,processed}'
# - Create eg: '~/git/Data/processed/' + drug (for each drug)
# - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'
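
# A minimal sketch of the TODO above (assumptions: the base dir and drug list
# below are illustrative, not part of the original script):
# import os
# base = os.path.expanduser('~/git/Data')
# for d in ['pyrazinamide']:
#     for sub in ['input', 'output/plots', 'output/structure']:
#         os.makedirs(os.path.join(base, d, sub), exist_ok = True)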

#%% specify homedir as python doesn't recognise tilde
import os # needed for os.path.expanduser below
homedir = os.path.expanduser('~')

#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'

#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'

#==========
# input dir
#==========
indir = datadir + '/' + drug + '/' + 'input'

#============
# output dir
#============
# several output files
outdir = datadir + '/' + drug + '/' + 'output'

#%% end of variable assignment for input and output files
#==============================================================================
@ -1,193 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
#=======================================================================
# Task: Hydrophobicity (Kd) values for amino acid sequence using the
# Kyte-Doolittle scale.
# Same output as using the expasy server https://web.expasy.org/protscale/
# Input: fasta file

# Output: csv file with the wild-type aa sequence and kd values per position

# useful links
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
#=======================================================================
#%% load packages
from pylab import *
from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
#from Bio.Alphabet.IUPAC import IUPACProtein
#import pprint as pp
import pandas as pd
import numpy as np
import sys, os
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'

#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'

#=======
# input
#=======
#indir = 'git/Data/pyrazinamide/input/original'
indir = datadir + '/' + drug + '/' + 'input'
in_filename = '3pl1.fasta.txt'
infile = indir + '/' + in_filename
print('Input filename:', in_filename
      , '\nInput path:', indir
      , '\n============================================================')

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_kd.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#===================
# calculate KD values: same as the expasy server
#===================
#%% specify window size for hydropathy profile computation
# https://web.expasy.org/protscale/pscale/protscale_help.html
my_window = 3
offset = round((my_window/2) - 0.5)

fh = open(infile)

for record in SeqIO.parse(fh, 'fasta'):
    id = record.id
    seq = record.seq
    num_residues = len(seq)
fh.close()

sequence = str(seq)

X = ProteinAnalysis(sequence)

kd_values = (X.protein_scale(ProtParamData.kd, window = my_window)) # edge weight is set to default (100%)

# sanity checks
print('Sequence Length:', num_residues)
print('kd_values Length:', len(kd_values))
print('Window Length:', my_window)
print('Window Offset:', offset)
print('=================================================================')
print('Checking: len(kd values) is as expected for the given window size & offset...')
expected_length = num_residues - (my_window - offset)
if len(kd_values) == expected_length:
    print('PASS: expected and actual length of kd values match')
else:
    print('FAIL: length mismatch'
          , '\nExpected length:', expected_length
          , '\nActual length:', len(kd_values)
          , '\n=========================================================')
#===================
# creating two dfs
#===================
#%% make 2 dfs; 1) aa sequence and 2) kd_values. Then reset index for each df
# which will allow easy merging of the two dfs.

# df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence)
# Name the wt column 'wild_type_kd' to be the same name used in the file required for merging later.
dfSeq = pd.DataFrame({'wild_type_kd':list(sequence)})
dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive

# df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values
dfVals = pd.DataFrame({'kd_values':kd_values})
dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
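
# Worked example (illustration only, not from the original): with my_window = 3
# and offset = 1, a 10-residue sequence yields 10 - 3 + 1 = 8 kd values,
# indexed here as positions 2..9; positions 1 and 10 carry no value and
# become NaN after the concat below.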

# sanity checks
max(dfVals['kd_values'])
min(dfVals['kd_values'])

#===================
# concatenating dfs
#===================
# Merge the two on index
# (as these are now reflective of the aa position numbers): df1 and df2
# This will introduce NaN where there are missing values. In our case this
# will be 2 (the first and last ones, based on window size and offset).
# For pnca: the last position is not part of the struc, so no info loss.
# Needless to say, this will vary for other targets.

kd_df = pd.concat([dfSeq, dfVals], axis = 1)

#============================
# renaming index to position
#============================
kd_df = kd_df.rename_axis('position')
kd_df.head()
print('=================================================================')

print('position col i.e. index should be numeric'
      , '\n===============================================================')

if kd_df.index.dtype == 'int64':
    print('PASS: position col is numeric'
          , '\ndtype is:', kd_df.index.dtype)
else:
    print('FAIL: position col is not numeric'
          , '\nConverting to numeric')
    kd_df.index = kd_df.index.astype('int64')
    print('Checking dtype after conversion:\n'
          , '\ndtype is:', kd_df.index.dtype
          , '\n=========================================================')
#===============
# writing file
#===============
print('Writing file:', outfile
      , '\nFilename:', out_filename
      , '\nPath:', outdir
      , '\n=============================================================')

kd_df.to_csv(outfile, header = True, index = True)

print('Finished writing:', out_filename
      , '\nNo. of rows:', len(kd_df)
      , '\nNo. of cols:', len(kd_df.columns)
      , '\n=============================================================')

#===============
# plot: optional!
#===============
#%% plot
# http://www.dalkescientific.com/writings/NBN/plotting.html

# FIXME: save fig
# extract just the pdb id from 'id' to pass to the plot title
# foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
plot(kd_values, linewidth = 1.0)
#axis(xmin = 1, xmax = num_residues)
xlabel('Residue Number')
ylabel('Hydrophobicity')
title('K&D Hydrophobicity for ' + id)
show()
print('======================================================================')
#%% end of script
#=======================================================================
@ -1,167 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''

# FIXME: import dirs.py to get the basic dir paths available
#=======================================================================
# TASK: calculate how many mutations result in
# electrostatic changes wrt wt

# Input: mcsm and AF_OR file

# Output: mut_elec_changes.txt
#=======================================================================
#%% load libraries
import os, sys
import pandas as pd
#import numpy as np
#=======================================================================
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'

#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'

#=======
# input
#=======
# merged_df3.csv lives in the output dir (written by the combining script)
indir = datadir + '/' + drug + '/' + 'output'
in_filename = 'merged_df3.csv'
infile = indir + '/' + in_filename
print('Input filename: ', in_filename
      , '\nInput path: ', indir
      , '\n============================================================')

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
# specify output file
out_filename = 'mut_elec_changes.txt'
outfile = outdir + '/' + out_filename
print('Output filename: ', out_filename
      , '\nOutput path: ', outdir
      , '\n============================================================')

#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input files
print('Reading input file (merged file):', infile)

comb_df = pd.read_csv(infile, sep = ',')

print('Input filename: ', in_filename
      , '\nPath :', indir
      , '\nNo. of rows: ', len(comb_df)
      , '\nNo. of cols: ', len(comb_df.columns)
      , '\n============================================================')

# column names
list(comb_df.columns)

# clear variables
del(in_filename, infile)

#%% subset unique mutations
df = comb_df.drop_duplicates(['Mutationinformation'], keep = 'first')

total_muts = df.Mutationinformation.nunique()
#df.Mutationinformation.count()
print('Total mutations associated with structure: ', total_muts
      , '\n===============================================================')

#%% combine aa_calcprop cols so that you can count the changes as value_counts
# check if all muts have been categorised
print('Checking if all muts have been categorised: ')
if (df['wt_calcprop'].isna().sum() == 0) and (df['mut_calcprop'].isna().sum() == 0):
    print('PASS: no NAs detected i.e. all muts have an aa prop associated'
          , '\n===============================================================')
else:
    print('FAIL: NAs detected i.e. some muts remain unclassified'
          , '\n===============================================================')

df['wt_calcprop'].head()
df['mut_calcprop'].head()

print('Combining wt_calcprop and mut_calcprop...')
#df['aa_calcprop_combined'] = df['wt_calcprop'] + '->' + df['mut_calcprop']
df['aa_calcprop_combined'] = df.wt_calcprop.str.cat(df.mut_calcprop, sep = '->')
df['aa_calcprop_combined'].head()

mut_categ = df["aa_calcprop_combined"].unique()
print('Total no. of aa_calc properties: ', len(mut_categ))
print('Categories are: ', mut_categ)

# counting no. of muts in each mut categ

# way1: count values within each combination
df.groupby('aa_calcprop_combined').size()
#df.groupby('aa_calcprop_combined').count()

# way2: count values within each combination
df['aa_calcprop_combined'].value_counts()

# comment: the two ways should be identical
# groupby result order is similar to pivot table order,
# I prefer the value_counts look
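
# Sanity sketch (illustration only, not from the original): the two counts
# should agree irrespective of ordering, since .eq() aligns on the index:
# df.groupby('aa_calcprop_combined').size().eq(
#     df['aa_calcprop_combined'].value_counts()).all()   # expect True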

# assign to variable: count values within each combination
all_prop = df['aa_calcprop_combined'].value_counts()

# convert to a df from Series
ap_df = pd.DataFrame({'aa_calcprop': all_prop.index, 'mut_count': all_prop.values})

# subset df to contain only the changes in prop
all_prop_change = ap_df[ap_df['aa_calcprop'].isin(['neg->neg', 'non-polar->non-polar', 'polar->polar', 'pos->pos']) == False]

elec_count = all_prop_change.mut_count.sum()
print('Total no. of muts with elec changes: ', elec_count)

# calculate percentage of electrostatic changes
elec_changes = (elec_count/total_muts) * 100

print('Total number of electrostatic changes resulting from mutation is (%):', elec_changes)

# check no change muts
no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg', 'non-polar->non-polar', 'polar->polar', 'pos->pos']) == True]

no_change_muts.mut_count.sum()

#%% output from console
#sys.stdout = open(file, 'w')
sys.stdout = open(outfile, 'w')

print('======================\n'
      , 'Unchanged muts'
      , '\n=====================\n'
      , no_change_muts
      , '\n=============================\n'
      , 'Muts with changed prop:'
      , '\n============================\n'
      , all_prop_change)

print('================================================================='
      , '\nTotal number of electrostatic changes resulting from mutation is (%):', elec_changes
      , '\nTotal no. of muts: ', total_muts
      , '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum()
      , '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum()
      , '\n===================================================================')
#%% end of script
#=======================================================================
@ -1,179 +0,0 @@
#!/usr/bin/python
from __future__ import print_function
from Bio import SeqIO
from Bio.Seq import Seq
from collections import OrderedDict
import sys
import argparse
# https://github.com/jrjhealey/bioinfo-tools/blob/master/Mutate.py
# https://www.biostars.org/p/336891/

# TODO:
# - create some logic to 'group' mutations that will be applied to the same sequence, to
#   make all switches at once (see the sketch below)
# - This will also probably break the verbose transversion output so the maths will need replacing
# - Create the ability to support INDELS (will also require pairwise alignment so that
#   hamming distances remain meaningful).
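
# A possible grouping sketch for the first TODO (an assumption, not implemented
# here): collect all changes per sequence id so they can be applied in one
# pass over a single mutable sequence, e.g.
#   from collections import defaultdict
#   grouped = defaultdict(list)
#   for k, v in mut_dict.items():
#       grouped[k.name].append(v)
#   # then apply each list of changes to one MutableSeq before writing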


def get_args():
    """Parse command line arguments"""
    desc = "Mutate fasta sequences based on a file of sequence mappings."
    epi = (
        "This script takes a mapfile of the form:\n"
        "  SequenceID,A123B\n"
        "  SequenceID,X456Y\n"
        "And performs substitutions/mutations. At present it only does one SNP per sequence.\n"
    )

    try:
        parser = argparse.ArgumentParser(
            description=desc, epilog=epi, formatter_class=argparse.RawTextHelpFormatter
        )
        parser.add_argument(
            "mutation_file",
            action="store",
            help='File of mutation mappings like so: "SeqID,X123Y"',
        )
        parser.add_argument(
            "sequences",
            action="store",
            help="File of sequences to be mutated (fasta only).",
        )
        parser.add_argument(
            "-v",
            "--verbose",
            action="store_true",
            help="Verbose behaviour, printing parameters of the script.",
        )
        parser.add_argument(
            "-o",
            "--outfile",
            action="store",
            help="Output file for mutated sequences (default STDOUT).",
        )
        if len(sys.argv) == 1:
            parser.print_help(sys.stderr)
            exit(1)
    except:
        sys.stderr.write(
            "An exception occurred with argument parsing. Check your provided options.\n"
        )

    return parser.parse_args()


class Mutation(object):
    """A class wrapper for sequence IDs so that duplicate IDs can be used in a dictionary"""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "'" + self.name + "'"

    def __str__(self):
        return self.name


def parse_mapfile(mapfile):
    """Return a dict of mapped mutations.
    File should resemble:
      SequenceID,A123B
      SequenceID2,X234Y
    Sequence IDs should exactly match the fasta headers, as parsed by BioPython.
    (">" symbols are optional)
    """

    with open(mapfile, "r") as handle:
        mut_dict = OrderedDict()
        for line in handle:
            id, change = line.lstrip(">").rstrip("\n").split(",")
            mut_dict[Mutation(id)] = change

    for k, v in mut_dict.items():
        assert v[0].isalpha(), (
            "First character of mutation map is not a valid letter. Got: %s" % v[0]
        )
        assert v[-1].isalpha(), (
            "Last character of mutation map is not a valid letter. Got: %s" % v[-1]
        )
        assert v[1:-1].isdigit(), (
            "Location string of mutation map is not a valid number. Got: %s" % v[1:-1]
        )

    return mut_dict


def morph(orig, loc, new, mutableseq, verbose):
    """Perform actual sequence change (polymorphism only at present)"""
    # Shift location to offset 0-based index
    loc = loc - 1
    assert mutableseq[loc] == orig, (
        "Sequence does not match the mutation file for pre-existing residue. Expected %s , got %s "
        % (orig, mutableseq[loc])
    )
    if verbose is True:
        print(
            "Performing change: {} -> {}, at location: {} (0 based)".format(
                orig, new, loc
            )
        )
    mutableseq[loc] = new
    return mutableseq


def hamming_distance(s1, s2):
    """Return the Hamming distance between equal-length sequences"""
    if len(s1) != len(s2):
        raise ValueError("Undefined for sequences of unequal length")
    return sum(ch1 != ch2 for ch1, ch2 in zip(s1.upper(), s2.upper()))


def main():
    args = get_args()
    if args.outfile is not None:
        ofh = open(args.outfile, "w")

    # Parse the mutation file (get mutations by sequence)
    mutations = parse_mapfile(args.mutation_file)
    if args.verbose is True:
        print("Got mutations:")
        print(mutations)
    # Iterate all sequences and make any substitutions necessary
    for record in SeqIO.parse(args.sequences, "fasta"):
        for k, v in mutations.items():
            mutable = record.seq.upper().tomutable()
            # mutable = record.seq.tomutable()
            # print("MO:", mutable)
            if k.name == record.id[0:4]:  # BEWARE HARDCODING
                # print("k.name:", k.name, "record.id:", record.id)
                orig = v[0]
                print("orig:", orig)
                new = v[-1]
                loc = int(v[1:-1])
                if args.verbose:
                    print(record.id)
                newseq = morph(orig, loc, new, mutable, args.verbose)
                # print("NS is:", newseq)
                if args.verbose is True:
                    print("Original: " + record.seq.upper())
                    print(
                        str((" " * int(loc - 2 + 11))) + "V"
                    )  # Padded string to show where the switch happened (not sure how it'll deal with line wrapping)
                    print("New: " + newseq)
                    print(
                        "Distance: "
                        + str(hamming_distance(str(record.seq), str(newseq)))
                    )
                if args.outfile is not None:
                    ofh.write(">%s_%s\n%s\n" % (record.id, v, newseq))
                if args.verbose is False:
                    print(">%s_%s\n%s\n" % (record.id, v, newseq))

    if args.outfile is not None:
        ofh.close()


if __name__ == "__main__":
    main()
@ -1,135 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
#=============================================================================
# Task: Residue depth (rd) processing to generate a df with residue_depth (rd)
# values

# FIXME
# Input: '.tsv' i.e. residue depth txt file (output from .zip file manually
# downloaded from the website).
# This should be integrated into the pipeline

# Output: .csv with 3 cols i.e. position, rd_values & 3-letter wt aa code (caps)
#=============================================================================
#%% load packages
import sys, os
import pandas as pd
#import numpy as np
#=============================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=============================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'

#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'

#=======
# input
#=======
#indir = 'git/Data/pyrazinamide/input/original'
indir = datadir + '/' + drug + '/' + 'output'
in_filename = '3pl1_rd.tsv'
infile = indir + '/' + in_filename
print('Input filename:', in_filename
      , '\nInput path:', indir
      , '\n=============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_rd.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')

#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input file
rd_data = pd.read_csv(infile, sep = '\t')
print('Reading input file:', infile
      , '\nNo. of rows:', len(rd_data)
      , '\nNo. of cols:', len(rd_data.columns))

print('Column names:', rd_data.columns
      , '\n===============================================================')
#========================
# creating position col
#========================
# Extracting residue number from index and assigning
# the values to a column [position]. Then convert the position col to numeric.
rd_data['position'] = rd_data.index.str.extract('([0-9]+)').values
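
# Illustration (an assumption about the index format, not from the original):
# an index entry such as 'A:SER1' (chain:residue) yields '1' here, since the
# regex '([0-9]+)' pulls out the first run of digits; it is converted to a
# numeric column just below.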

# converting position to numeric
rd_data['position'] = pd.to_numeric(rd_data['position'])
rd_data['position'].dtype

print('Extracted residue num from index and assigned as a column:'
      , '\ncolumn name: position'
      , '\ntotal no. of cols now:', len(rd_data.columns)
      , '\n=============================================================')

#========================
# Renaming amino-acid
# and all-atom cols
#========================
print('Renaming columns:'
      , '\ncolname==> # chain:residue: wt_3letter_caps'
      , '\nYES... the column name *actually* contains a # ..!'
      , '\ncolname==> all-atom: rd_values'
      , '\n=============================================================')

rd_data.rename(columns = {'# chain:residue':'wt_3letter_caps', 'all-atom':'rd_values'}, inplace = True)
print('Column names:', rd_data.columns)

#========================
# extracting df with the
# desired columns
#========================
print('Extracting relevant columns for writing df as csv')

rd_df = rd_data[['position','rd_values','wt_3letter_caps']]

if len(rd_df) == len(rd_data):
    print('PASS: extracted df has expected no. of rows'
          , '\nExtracted df dim:'
          , '\nNo. of rows:', len(rd_df)
          , '\nNo. of cols:', len(rd_df.columns))
else:
    print('FAIL: no. of rows mismatch'
          , '\nExpected no. of rows:', len(rd_data)
          , '\nGot no. of rows:', len(rd_df)
          , '\n=========================================================')

#%% write file
print('Writing file:'
      , '\nFilename:', out_filename
      , '\nPath:', outdir
      , '\n=============================================================')

rd_df.to_csv(outfile, header = True, index = False)

print('Finished writing:', out_filename
      , '\nNo. of rows:', len(rd_df)
      , '\nNo. of cols:', len(rd_df.columns)
      , '\n=============================================================')

#%% end of script
#=======================================================================
@ -1,18 +0,0 @@
#!/bin/bash

#https://www.biostars.org/p/336891/
#python Mutate.py -v -o /path/to/output.fasta mutation_map_file.csv input.fasta

# pnca_all_muts_msa_FIXME: This should be formatted like this from the python script
# change to a cmd script that takes this "prefix" as an input
for i in $(cat pnca_all_muts_msa_FIXME.csv); do echo "3PL1,${i}"; done > pnca_copy.txt

# make sure there is no new line at the end of the mutation file (snps.csv)
#python3 Mutate.py -v -o /home/tanu/git/Data/pyrazinamide/input/output.fasta mut_map.csv 3pl1.fasta.txt
python3 mutate.py -v -o /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt /home/tanu/git/Data/pyrazinamide/output/pnca_all_muts_msa.csv /home/tanu/git/Data/pyrazinamide/input/pnca_fasta.txt

# remove fasta style header lines in the output i.e.
# lines beginning with '>' so the file is just the mutated seqs
sed -i '/^>.*$/d' /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt
printf 'No. of lines after cleaning: '
wc -l < /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt
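
# Optional sanity sketch (an assumption, not part of the original workflow):
# each mapped mutation should yield one mutated sequence, so these two counts
# are expected to match:
# wc -l < /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt
# wc -l < /home/tanu/git/Data/pyrazinamide/output/pnca_all_muts_msa.csv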
@ -1,54 +0,0 @@
#!/bin/bash
#=======================================================================
# Task: read a pdb file and generate a dssp output file
# Input:
# pdb file

# Output:
# pdb_code.dssp
# needs dssp exe on linux
# more efficient to run dssp exe locally

# note: double quotes for variable interpolation
#=======================================================================
#%% specify variables for input and output paths and filenames
drug='pyrazinamide'
gene='pncA'
# convert to lowercase for consistency in filenames
gene_l=$(printf $gene | awk '{print tolower($0)}')
gene_match="${gene}_p."
#printf "${gene_match}\n"

#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
#datadir="~git/Data"
datadir="${HOME}/git/Data"

#=======
# input
#=======
indir=${datadir}'/'${drug}'/input'
printf "Input dir: ${indir}\n"
in_filename='3pl1.pdb'
#infile=${basedir}${inpath}${in_filename}
infile=${indir}'/'${in_filename}
printf "Input file: ${infile}\n"

#=======
# output
#=======
outdir=${datadir}'/'${drug}'/output'
printf "Output dir: ${outdir}\n"
#out_filename='3pl1.dssp'
out_filename="${gene_l}.dssp"
outfile=${outdir}'/'${out_filename}
printf "Output file: ${outfile}\n"

#%% end of variable assignment for input and output files
#================================================================
# command line call to run dssp and create the output file
dssp -i ${infile} -o ${outfile}
printf "Finished writing: ${outfile}\n"
#printf "Filename: ${out_filename}\nlocated in: ${outdir}\n"
@ -1,221 +0,0 @@
#=======================================================================
# Task: To generate a logo plot or bar plot coloured by
# aa properties.
# step1: read mcsm file and OR file
# step2: plot wild type positions
# step3: plot mutants per position coloured by aa properties
# step4: make the size of the letters/bars prop to OR if you can!

# useful links
# https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
# https://omarwagih.github.io/ggseqlogo/
# https://kkdey.github.io/Logolas-pages/workflow.html
# A new sequence logo plot to highlight enrichment and depletion.
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/

# very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/
#=======================================================================
#%% specify curr dir
getwd()
setwd('~/git/LSHTM_analysis/plotting_test/')
getwd()
#=======================================================================
#%% load packages

# header file
header_dir = '~/git/LSHTM_analysis/'
source(paste0(header_dir, '/', 'my_header.R'))
#=======================================================================
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene,'_p.')
cat(gene_match)

#===========
# data dir
#===========
datadir = paste0('~/git/Data')

#===========
# input
#===========
# source R script 'combining_df_ps.R'
#indir = paste0(datadir, '/', drug, '/', 'output') # reading files
indir = '../meta_data_analysis' # sourcing R script
in_filename = 'combining_df_ps.R'
infile = paste0(indir, '/', in_filename)
cat(paste0('Input is an R script: ', '\'', infile, '\'')
    , '\n========================================================')

#===========
# output
#===========
# 1) lineage dist of all muts
outdir = paste0('~/git/Data', '/', drug, '/', 'output/plots') #same as indir
#cat('Output dir: ', outdir, '\n')
#file_type = '.svg'
#out_filename1 = paste0(tolower(gene), '_lineage_dist_ps', file_type)
#outfile1 = paste0(outdir, '/', out_filename1)
#cat(paste0('Output plot1 :', outfile1)
#    , '\n========================================================')

#%% end of variable assignment for input and output files
#=======================================================================
##%% read input file
cat('Reading input file (sourcing R script):', in_filename)

source(infile)

#==========================
# This will return:

# df with NA for pyrazinamide:
# merged_df2
# merged_df3

# df without NA for pyrazinamide:
# merged_df2_comp
# merged_df3_comp
#===========================

###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is a one-many relationship
# i.e. the same SNP can belong to multiple lineages
# using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available, hence use df with NA

# This will be the first plotting df
# Then subset this to extract dr muts only (second plotting df)
###########################

#%%%%%%%%%%%%%%%%%%%%%%%%%
# uncomment as necessary
# REASSIGNMENT
#my_data = merged_df2
#my_data = merged_df2_comp
#my_data = merged_df3
my_data = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%%%

# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)

# quick checks
colnames(my_data)
str(my_data)

c1 = unique(my_data$Position)
nrow(my_data)
cat('No. of rows in my_data:', nrow(my_data)
    , '\nDistinct positions corresponding to snps:', length(c1)
    , '\n===========================================================')

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# FIXME: Think and decide what you want to remove
# mut_pos_occurrence < 1 or sample_pos_occurrence < 1

# get freq count of positions so you can subset freq < 1
require(data.table)
#setDT(my_data)[, mut_pos_occurrence := .N, by = .(Position)] #265, 14

#extract freq_pos > 1
#my_data_snp = my_data[my_data$occurrence!=1,]

#u = unique(my_data_snp$Position) #73
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT to prevent changing code
my_data_snp = my_data
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#=======================================================================
#%% logo plots from dataframe

#############
# PLOTS: ggseqlogo with custom height
# https://omarwagih.github.io/ggseqlogo/
#############
#require(ggplot2)
require(tidyverse) # for %>% and spread() below; may already come via my_header.R
library(ggseqlogo)

foo = my_data_snp[, c("Position", "Mutant_type", "ratioDUET", "OR"
                      , "mut_prop_polarity", "mut_prop_water")]

# log10OR
# FIXME: at the source script (when calculating AFandOR)
my_data_snp$log10or = log10(my_data_snp$OR)
my_data_snp$logor = log(my_data_snp$OR) # natural log, so 'bar' below has both
bar = my_data_snp[, c('Position', 'Mutant_type', 'OR', 'logor', 'log10or')]

bar_or = my_data_snp[, c('Position', 'Mutant_type', 'OR')]
wide_df_or <- bar_or %>% spread(Position, OR, fill = 0)
wide_df_or = as.matrix(wide_df_or)
rownames(wide_df_or) = wide_df_or[,1]
wide_df_or = wide_df_or[,-1]
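
# Toy sketch of the reshape above (illustration only, not from the original):
# spread() turns the long (Position, Mutant_type, OR) table into one column
# per Position with Mutant_type as rows and OR as cell values (0 where a
# mutant was not seen), i.e. the matrix layout ggseqlogo's method = 'custom'
# expects, with amino acid letters as rownames.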

# custom height (OR) logo plot: yayy works
ggseqlogo(wide_df_or, method='custom', seq_type='aa') + ylab('my custom height') +
  theme(legend.position = "bottom"
        , axis.text.x = element_text(size = 11
                                     , angle = 90
                                     , hjust = 1
                                     , vjust = 0.4)
        , axis.text.y = element_text(size = 15
                                     , angle = 0
                                     , hjust = 1
                                     , vjust = 0)) +
  labs(title = "AA logo plot"
       , x = "Wild-type Position"
       , y = "OR")
#%% end of logo plot with OR as height
#=======================================================================
# extracting data with log10OR
bar_logor = my_data_snp[, c('Position', 'Mutant_type', 'log10or')]
wide_df_logor <- bar_logor %>% spread(Position, log10or, fill = 0)

wide_df_logor = as.matrix(wide_df_logor)
rownames(wide_df_logor) = wide_df_logor[,1]
wide_df_logor = wide_df_logor[,-1]

# custom height (log10OR) logo plot: yayy works
ggseqlogo(wide_df_logor, method='custom', seq_type='aa') + ylab('my custom height') +
  theme(legend.position = "bottom"
        , axis.text.x = element_text(size = 11
                                     , angle = 90
                                     , hjust = 1
                                     , vjust = 0.4)
        , axis.text.y = element_text(size = 15
                                     , angle = 0
                                     , hjust = 1
                                     , vjust = 0)) +
  labs(title = "AA logo plot"
       , x = "Wild-type Position"
       , y = "Log10(OR)")

#=======================================================================
#%% logo plot from sequence

#################
# Plot: LOGOLAS (ED plots)
# link: https://github.com/kkdey/Logolas
# on all pncA samples: output of mutate.py
################
library(Logolas)

seqs = read.csv('~/git/Data/pyrazinamide/output/pnca_msa.txt'
                , header = FALSE
                , stringsAsFactors = FALSE)$V1

# my_data: useful!
logomaker(seqs, type = "EDLogo", color_type = 'per_symbol'
          , return_heights = TRUE)
logomaker(seqs, type = "Logo", color_type = 'per_symbol')

#%% end of script
#=======================================================================