adding clean files for rerun of 35k dataset

This commit is contained in:
Tanushree Tunstall 2020-07-07 18:28:55 +01:00
parent 943513a338
commit a7f21cfb14
32 changed files with 157 additions and 44550 deletions


@@ -1,319 +0,0 @@
#!/usr/bin/env Rscript
#require('compare')
require('getopt', quietly=TRUE) # We need to be able to parse arguments
#########################################################
# TASK: To calculate Allele Frequency and
# Odds Ratio from master data
# and add the calculated params to meta_data extracted from
# data_extraction.py
#########################################################
#getwd()
setwd('~/git/LSHTM_analysis/scripts')
cat(c(getwd(),'\n'))
# Command line args
spec = matrix(c(
"drug" , "d", 1, "character",
"gene" , "g", 1, "character"
), byrow = TRUE, ncol = 4)
opt = getopt(spec);
drug = opt$drug
gene = opt$gene
if(is.null(drug)|is.null(gene)) {
stop('Missing arguments: --drug and --gene must both be specified (case-sensitive)')
}
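# Example invocation (script name illustrative; getopt accepts the long flags defined above):
# Rscript af_or_calcs.R --drug pyrazinamide --gene pncA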
#options(scipen = 999) #disabling scientific notation in R.
#options(scipen = 4)
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene,'_p.')
cat(gene_match)
#===========
# input
#===========
# infile1: Raw data
#indir = 'git/Data/pyrazinamide/input/original'
indir = paste0('~/git/Data')
in_filename = 'original_tanushree_data_v2.csv'
#in_filename = 'mtb_gwas_v3.csv'
infile = paste0(indir, '/', in_filename)
cat(paste0('Reading infile1: raw data', ' ', infile) )
# infile2: gene associated meta data file to extract valid snps and add calcs to.
# This is outfile3 from data_extraction.py
indir_metadata = paste0('~/git/Data', '/', drug, '/', 'output')
in_filename_metadata = 'pnca_metadata.csv'
infile_metadata = paste0(indir_metadata, '/', in_filename_metadata)
cat(paste0('Reading infile2: gene associated metadata:', infile_metadata))
#===========
# output
#===========
# outdir = 'git/Data/pyrazinamide/output'
outdir = paste0('~/git/Data', '/', drug, '/', 'output')
#out_filename = paste0(tolower(gene), '_meta_data_with_AF_OR.csv')
out_filename = paste0(tolower(gene), '_af_or.csv')
outfile = paste0(outdir, '/', out_filename)
cat(paste0('Output file with full path:', outfile))
#%% end of variable assignment for input and output files
#########################################################
# 1: Read master/raw data stored in Data/
#########################################################
raw_data_all = read.csv(infile, stringsAsFactors = F)
# building cols to extract
dr_muts_col = paste0('dr_mutations_', drug)
other_muts_col = paste0('other_mutations_', drug)
cat('Extracting columns based on variables:\n'
, drug
, '\n'
, dr_muts_col
, '\n'
, other_muts_col
, '\n===============================================================')
raw_data = raw_data_all[,c("id"
, drug
, dr_muts_col
, other_muts_col)]
rm(raw_data_all)
rm(indir, in_filename, infile)
#===========
# 1a: exclude na
#===========
raw_data = raw_data[!is.na(raw_data[[drug]]),]
total_samples = length(unique(raw_data$id))
cat(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples))
# sanity check: should be true
is.numeric(total_samples)
#===========
# 1b: combine the two mutation columns
#===========
all_muts_colname = paste0('all_mutations_', drug)
#raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)
raw_data[[all_muts_colname]] = paste(raw_data[[dr_muts_col]], raw_data[[other_muts_col]])
head(raw_data[[all_muts_colname]])
#===========
# 1c: create yet another column that contains all the mutations but in lower case
#===========
head(raw_data[[all_muts_colname]])
raw_data$all_muts_gene = tolower(raw_data[[all_muts_colname]])
head(raw_data$all_muts_gene)
# sanity checks
#table(grepl("gene_p",raw_data$all_muts_gene))
cat(paste0('converting gene match:', gene_match, ' ', 'to lowercase'))
gene_match = tolower(gene_match)
table(grepl(gene_match,raw_data$all_muts_gene))
# sanity check: should be TRUE
#sum(table(grepl("gene_p",raw_data$all_muts_gene))) == total_samples
# sanity check
if(sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples){
cat('PASS: Total no. of samples match')
} else{
cat('FAIL: No. of samples mismatch')
}
#########################################################
# 2: Read valid snps for which OR
# can be calculated
#########################################################
cat(paste0('Reading metadata infile:', infile_metadata))
gene_metadata = read.csv(infile_metadata
#, file.choose()
, stringsAsFactors = F
, header = T)
# clear variables
rm(in_filename_metadata, infile_metadata)
# count na in pyrazinamide column
tot_pza_na = sum(is.na(gene_metadata$pyrazinamide))
expected_rows = nrow(gene_metadata) - tot_pza_na
# drop na from the pyrazinamide column
gene_snps_or = gene_metadata[!is.na(gene_metadata[[drug]]),]
# sanity check
if(nrow(gene_snps_or) == expected_rows){
cat('PASS: no. of rows match with expected_rows')
} else{
cat('FAIL: nrows mismatch.')
}
# extract unique snps to iterate over for AF and OR calcs
gene_snps_unique = unique(gene_snps_or$mutation)
cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_snps_unique)))
#===========================================================================================
#########################
# custom chisq function:
# To calculate OR
#########################
i = "pnca_p.trp68gly"
mut = grepl(i,raw_data$all_muts_gene)
mut = as.numeric(mut)
dst = raw_data[[drug]]
#x = as.numeric(mut)
#y = dst
mychisq_or = function(x,y){
tab = as.matrix(table(x,y))
a = tab[2,2]
if (a==0){ a<-0.5}
b = tab[2,1]
if (b==0){ b<-0.5}
c = tab[1,2]
if (c==0){ c<-0.5}
d = tab[1,1]
if (d==0){ d<-0.5}
(a/b)/(c/d)
}
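# Worked example (illustrative counts) for the 2x2 table(dst, mut):
# a = tab[2,2] = 10 (resistant, mutant), b = tab[2,1] = 5 (resistant, wild-type),
# c = tab[1,2] = 20 (susceptible, mutant), d = tab[1,1] = 100 (susceptible, wild-type)
# OR = (a/b)/(c/d) = (10/5)/(20/100) = 10
# Zero cells are replaced with 0.5 (Haldane-Anscombe correction) to avoid division by zero.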
or_mychisq = mychisq_or(dst, mut)
print(paste0('mychisq OR:', or_mychisq ))
#=====================================
#OR calcs using the following:
#1) chisq.test
#2) fisher
#3) modified chisq.test
#4) logistic
#5) adjusted logistic?
#6) kinship (separate script)
#======================================
# TEST FOR a few muts: sapply and df
#===============================================
snps <- gene_snps_unique # reassign so you test with subset of muts
#snps <- gene_snps_unique[1:2]
cat(paste0('Running calculations for:', length(snps), ' nssnps\n'
, 'gene: ', gene
, '\ndrug: ', drug ))
# DV: pyrazinamide 0 or 1
dst = raw_data[[drug]]
# initialise an empty df
ors_df = data.frame()
x = sapply(snps,function(m){
mut = grepl(m,raw_data$all_muts_gene)
mut = as.numeric(mut)
cat(paste0('Running mutation:', m, '\n'))
model<-glm(dst ~ mut, family = binomial)
#-------------------
# allele frequency
#-------------------
afs = mean(mut)
#-------------------
# logistic model
#-------------------
beta_logistic = summary(model)$coefficients[2,1]
or_logistic = exp(summary(model)$coefficients[2,1])
#print(paste0('logistic OR:', or_logistic))
pval_logistic = summary(model)$coefficients[2,4]
#print(paste0('logistic pval:', pval_logistic))
se_logistic = summary(model)$coefficients[2,2]
#print(paste0('logistic SE:', se_logistic))
zval_logistic = summary(model)$coefficients[2,3]
#print(paste0('logistic zval:', zval_logistic))
ci_mod = exp(confint(model))[2,]
#print(paste0('logistic CI:', ci_mod))
ci_lower_logistic = ci_mod[["2.5 %"]]
ci_upper_logistic = ci_mod[["97.5 %"]]
#-------------------
# custom_chisq and fisher: OR p-value and CI
#-------------------
or_mychisq = mychisq_or(dst, mut)
#print(paste0('mychisq OR:', or_mychisq))
odds_fisher = fisher.test(table(dst, mut))$estimate
or_fisher = odds_fisher[[1]]
pval_fisher = fisher.test(table(dst, mut))$p.value
ci_lower_fisher = fisher.test(table(dst, mut))$conf.int[1]
ci_upper_fisher = fisher.test(table(dst, mut))$conf.int[2]
#-------------------
# chi sq estimates
#-------------------
estimate_chisq = chisq.test(table(dst, mut))$statistic; estimate_chisq
est_chisq = estimate_chisq[[1]]; print(est_chisq)
pval_chisq = chisq.test(table(dst, mut))$p.value
# build a row to append to df
row = data.frame(mutation = m
, af = afs
, beta_logistic = beta_logistic
, or_logistic = or_logistic
, pval_logistic = pval_logistic
, se_logistic = se_logistic
, zval_logistic = zval_logistic
, ci_low_logistic = ci_lower_logistic
, ci_hi_logistic = ci_upper_logistic
, or_mychisq = or_mychisq
, or_fisher = or_fisher
, pval_fisher = pval_fisher
, ci_low_fisher= ci_lower_fisher
, ci_hi_fisher = ci_upper_fisher
, est_chisq = est_chisq
, pval_chisq = pval_chisq
)
#print(row)
ors_df <<- rbind(ors_df, row)
})
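# NOTE: sapply() is used here purely for its side effect; each iteration appends
# one row to the global ors_df via '<<-', so 'x' itself is not used downstream.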
#%%======================================================
# Writing file with calculated ORs and AFs
cat(paste0('writing output file: '
, '\nFilename: ', out_filename))
write.csv(ors_df, outfile
, row.names = F)
cat(paste0('Finished writing:'
, outfile
, '\nNo. of rows: ', nrow(ors_df)
, '\nNo. of cols: ', ncol(ors_df)))
#************************************************
cat('\n======================================================================\n')
cat('End of script: calculated AF, OR, pvalues and saved file')


@@ -1,51 +0,0 @@
#!/usr/bin/env python3
from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
import re
import os
#%%
def myalign(ref_seq, pdb_seq):
myalign_dict = {}
alignments = pairwise2.align.globalxx(ref_seq, pdb_seq)
#alignments = pairwise2.align.localxx(ref, struct)
match = []
for a, b in zip(alignments[0][0], alignments[0][1]):
if a == b:
match.append('|')
else:
match.append(' ')
#print(match)
print(alignments[0][0])
print("".join(match))
print(alignments[0][1])
result_align = alignments[0][1]
#print(result_align)
print('===============================================================\n')
# update dict
myalign_dict.update({'aligned_fasta': result_align})
# find start and end of match
aa_regex = r'\w'
m = re.search(aa_regex, result_align)
#m = my_match.span()
offset = m.start()
offset_end = m.end()
print('start of match:', offset
, '\nend of match:', offset_end)
print('===============================================================\n')
# update dict
myalign_dict.update({'start_match' : offset})
myalign_dict.update({'end_match' : offset_end})
return myalign_dict
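# Example usage (sequences illustrative): align a truncated pdb sequence to a reference.
# res = myalign('MRALIIVDVQNDF', 'ALIIVDVQNDF')
# res['aligned_fasta'] # pdb sequence aligned to the reference, gaps shown as '-'
# res['start_match'] # 0-based position of the first aligned (non-gap) residue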


@@ -1,24 +0,0 @@
#!/usr/bin/python3
#=======================================================================
# TASK: select specified chains from the pdb & save a cropped PDB with
# the selected chains. Useful for modelling dimers, etc.
# link for saving each chain as a separate file
# https://stackoverflow.com/questions/11685716/how-to-extract-chains-from-a-pdb-file
#=======================================================================
from Bio.PDB import PDBParser, PDBIO, Select
# Select subclass: accept_chain() returns 1 (True) for every chain whose id is in 'chain'
class ChainExtract(Select):
def __init__(self, chain):
self.chain = chain
def accept_chain(self, chain):
#print(dir(chain))
if chain.id in self.chain:
return 1
else:
return 0
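# Example usage (file names illustrative): keep only chains A and B from a complex.
# p = PDBParser(QUIET = True)
# structure = p.get_structure('complex', 'complex.pdb')
# io = PDBIO()
# io.set_structure(structure)
# io.save('complex_chainsAB.pdb', ChainExtract(['A', 'B']))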


@@ -1,28 +0,0 @@
#!/usr/bin/python3
#=======================================================================
# TASK: extract chain from pdb and save each chain as a separate file
# link for saving each chain as a separate file
#=======================================================================
__description__ = \
"""
pdb_chain_splitter.py
extracts chains and saves them as separate pdb files.
"""
__author__ = "Tanushree Tunstall"
__date__ = ""
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser
class ChainSelect(Select):
def __init__(self, chain):
self.chain = chain
def accept_chain(self, chain):
if chain.get_id() == self.chain:
return 1
else:
return 0
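# Example usage (file names illustrative): write each chain to its own pdb file.
# p = PDBParser(QUIET = True)
# structure = p.get_structure('complex', 'complex.pdb')
# io = PDBIO()
# io.set_structure(structure)
# for chain in structure.get_chains():
#     io.save(chain.get_id() + '.pdb', ChainSelect(chain.get_id()))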


@@ -1,177 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
# FIXME: change filename 2(mcsm normalised data)
# to be consistent like (pnca_complex_mcsm_norm.csv) : changed manually, but ensure this is done in the mcsm pipeline
#=======================================================================
# Task: combine 2 dfs on common cols by detecting them
# includes sanity checks
#=======================================================================
#%% load packages
import sys, os
import pandas as pd
import numpy as np
import re
#from varname import nameof
#%% end of variable assignment for input and output files
#=======================================================================
#%% function/methd to combine dfs
def detect_common_cols (df1, df2):
"""
Detect common cols
@param df1: df
@type df1: pandas df
@param df2: df
@type df2: pandas df
@return: common cols
@type: list
"""
common_cols = np.intersect1d(df1.columns, df2.columns).tolist()
print('Length of comm_cols:', len(common_cols)
, '\nmerging column/s:', common_cols
, '\ntype:', type(common_cols)
, '\ndtypes in merging columns:\n', df1[common_cols].dtypes)
return common_cols
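# Example (illustrative): if df1 has columns ['mutationinformation', 'position', 'duet_stability_change']
# and df2 has ['mutationinformation', 'position', 'ddg'], this returns
# ['mutationinformation', 'position'].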
#%% Function to combine 2 dfs by detecting common cols and performing
# sanity checks on the output df
def combine_dfs_with_checks(df1, df2, my_join = 'outer'):
"""
Combine 2 dfs by finding merging columns automatically
@param df1: data frame
@type df1: pandas df
@param df2: data frame
@type df2: pandas df
@my_join: join type for merging
@type my_join: string
@return: combined_df
@type: pandas df
"""
print('Finding comm_cols and merging cols:'
,'\n=========================================================')
common_cols = np.intersect1d(df1.columns, df2.columns).tolist()
print('Length of comm_cols:', len(common_cols)
, '\nmerging column/s:', common_cols
, '\ntype:', type(common_cols))
#print('\ndtypes in merging columns:\n', df1[common_cols].dtypes)
print('selecting consistent dtypes for merging (object i.e string)')
#merging_cols = df1[comm_valson_cols].select_dtypes(include = [object]).columns.tolist()
#merging_cols = df1[comm_valson_cols].select_dtypes(include = ['int64']).columns.tolist()
merging_cols = common_cols.copy()
nmerging_cols = len(merging_cols)
print(' length of merging cols:', nmerging_cols
, '\nmerging cols:', merging_cols, 'type:', type(merging_cols)
, '\n=========================================================')
#========================
# merge 1 (combined_df)
# concatenating 2dfs:
# df1, df2
#========================
# checking cross-over of mutations in the two dfs to merge
ndiff_1 = df1[merging_cols].squeeze().isin(df2[merging_cols].squeeze()).sum()
ndiff1 = df1.shape[0] - ndiff_1
print('There are', ndiff1, 'unmatched mutations in left df')
#missing_mutinfo = df1[~left_df['mutationinformation'].isin(df2['mutationinformation'])]
#missing_mutinfo.to_csv('infoless_muts.csv')
ndiff_2 = df2[merging_cols].squeeze().isin(df1[merging_cols].squeeze()).sum()
ndiff2 = df2.shape[0] - ndiff_2
print('There are', ndiff2, 'unmatched mutations in right_df')
#comm_vals = np.intersect1d(df1[merging_cols], df2[merging_cols])
#comm_vals_count = len(comm_vals)
#print('length of comm_valson values:', comm_vals_count , '\ntype:', type(comm_vals_count))
#========================
# merging dfs & sanity checks
#========================
fail = False
print('combining with:', my_join)
comb_df = pd.merge(df1, df2, on = merging_cols, how = my_join)
expected_cols = df1.shape[1] + df2.shape[1] - nmerging_cols
if my_join == 'right':
df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
expected_rows = df2_nd.shape[0]
if my_join == 'left':
expected_rows = df1.shape[0]
#if my_join == 'inner':
# expected_rows = comm_vals_count
#if my_join == 'outer':
# df1_nd = df1.drop_duplicates(merging_cols, keep = 'first')
# df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
# expected_rows = df1_nd.shape[0] + df2_nd.shape[0] - comm_vals_count
if my_join in ('inner', 'outer') and len(merging_cols) > 1:
#comm_vals = np.intersect1d(df1['mutationinformation'], df2['mutationinformation'])
print('length of merging_cols > 1, therefore omitting row checks')
combined_df = comb_df.copy()
expected_rows = len(combined_df)
else:
comm_vals = np.intersect1d(df1[merging_cols], df2[merging_cols])
print('length of merging_cols == 1, calculating expected rows in merged_df')
combined_df = comb_df.drop_duplicates(subset = merging_cols, keep ='first')
if my_join == 'inner':
expected_rows = len(comm_vals)
if my_join == 'outer':
df1_nd = df1.drop_duplicates(merging_cols, keep = 'first')
df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
expected_rows = df1_nd.shape[0] + df2_nd.shape[0] - len(comm_vals)
if len(combined_df) == expected_rows and len(combined_df.columns) == expected_cols:
print('PASS: successfully combined dfs with:', my_join, 'join')
else:
print('FAIL: combined_df\'s expected rows and cols not matched')
fail = True
print('\nExpected no. of rows:', expected_rows
, '\nGot:', len(combined_df)
, '\nExpected no. of cols:', expected_cols
, '\nGot:', len(combined_df.columns))
if fail:
sys.exit()
#if clean:
#foo = combined_df2.filter(regex = r'.*_x|_y', axis = 1)
#print(foo.columns)
#print('Detected duplicate cols with suffix: _x _y'
# , '\Dropping duplicate cols and cleaning')
# drop position col containing suffix '_y' and then rename col without suffix
combined_df_clean = combined_df.drop(combined_df.filter(regex = r'.*_y').columns, axis = 1)
combined_df_clean.rename(columns=lambda x: re.sub('_x$','', x), inplace = True)
return combined_df_clean
#%% end of function
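# Example call (dataframes illustrative):
# combined = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = 'outer')
# The function exits via sys.exit() if the merged shape differs from the expected
# rows/cols, then drops duplicate '_y' columns and strips the '_x' suffix before returning.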
#=======================================================================


@@ -1,287 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
# FIXME: change filename 2(mcsm normalised data)
# to be consistent like (pnca_complex_mcsm_norm.csv) : changed manually, but ensure this is done in the mcsm pipeline
#=======================================================================
# Task: combine 2 dfs with aa position as linking column
# Input: 2 dfs
# <gene.lower()>_complex_mcsm_norm.csv
# <gene.lower()>_foldx.csv
# Output: .csv of all 2 dfs combined
# useful link
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=======================================================================
#%% load packages
import sys, os
import pandas as pd
import numpy as np
#from varname import nameof
import argparse
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
# FIXME: local imports
#from combining import combine_dfs_with_checks
from combining_FIXME import detect_common_cols
#=======================================================================
#%% command line args
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
#args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'
#drug = args.drug
#gene = args.gene
#======
# dirs
#======
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/' + 'input'
outdir = datadir + '/' + drug + '/' + 'output'
#=======
# input
#=======
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv'
in_filename_foldx = gene.lower() + '_foldx.csv'
in_filename_dssp = gene.lower() + '_dssp.csv'
in_filename_kd = gene.lower() + '_kd.csv'
in_filename_rd = gene.lower() + '_rd.csv'
in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info.csv'
in_filename_afor = gene.lower() + '_af_or.csv'
in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
infile_mcsm = outdir + '/' + in_filename_mcsm
infile_foldx = outdir + '/' + in_filename_foldx
infile_dssp = outdir + '/' + in_filename_dssp
infile_kd = outdir + '/' + in_filename_kd
infile_rd = outdir + '/' + in_filename_rd
infile_snpinfo = indir + '/' + in_filename_snpinfo
infile_afor = outdir + '/' + in_filename_afor
infile_afor_kin = outdir + '/' + in_filename_afor_kin
print('\nInput path:', outdir
, '\nInput filename mcsm:', infile_mcsm
, '\nInput filename foldx:', infile_foldx
, '\nInput filename dssp:', infile_dssp
, '\nInput filename kd:', infile_kd
, '\nInput filename rd', infile_rd
, '\nInput filename snp info:', infile_snpinfo
, '\nInput filename af or:', infile_afor
, '\nInput filename afor kinship:', infile_afor_kin
, '\n============================================================')
#=======
# output
#=======
out_filename_comb = gene.lower() + '_all_params.csv'
outfile_comb = outdir + '/' + out_filename_comb
print('Output filename:', outfile_comb
, '\n============================================================')
o_join = 'outer'
l_join = 'left'
r_join = 'right'
i_join = 'inner'
# end of variable assignment for input and output files
#%%====================================================================
mcsm_df = pd.read_csv(infile_mcsm, sep = ',')
mcsm_df.columns = mcsm_df.columns.str.lower()
foldx_df = pd.read_csv(infile_foldx , sep = ',')
print('==================================='
, '\nFirst merge: mcsm + foldx'
, '\n===================================')
#mcsm_foldx_dfs = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = o_join)
merging_cols_m1 = detect_common_cols(mcsm_df, foldx_df)
mcsm_foldx_dfs = pd.merge(mcsm_df, foldx_df, on = merging_cols_m1, how = 'outer')
ncols_m1 = len(mcsm_foldx_dfs.columns)
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print('==================================='
, '\nSecond merge: dssp + kd'
, '\n===================================')
dssp_df = pd.read_csv(infile_dssp, sep = ',')
kd_df = pd.read_csv(infile_kd, sep = ',')
rd_df = pd.read_csv(infile_rd, sep = ',')
#dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = o_join)
merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
dssp_kd_dfs = pd.merge(dssp_df, kd_df, on = merging_cols_m2, how = 'outer')
print('==================================='
, '\nThird merge: dssp_kd_dfs + rd_df'
, '\n===================================')
#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = o_join)
merging_cols_m3 = detect_common_cols(dssp_kd_dfs, rd_df)
dssp_kd_rd_dfs = pd.merge(dssp_kd_dfs, rd_df, on = merging_cols_m3, how = 'outer')
ncols_m3 = len(dssp_kd_rd_dfs.columns)
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print('==================================='
, '\nFourth merge: First merge + Third merge'
, '\n===================================')
#combined_dfs = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = i_join)# gives wrong!
merging_cols_m4 = detect_common_cols(mcsm_foldx_dfs, dssp_kd_rd_dfs)
combined_df_expected_cols = ncols_m1 + ncols_m3 - len(merging_cols_m4)
combined_df = pd.merge(mcsm_foldx_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how = 'inner')
if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
print('PASS: successfully combined 5 dfs'
, '\nnrows combined_df:', len(combined_df)
, '\ncols combined_df:', len(combined_df.columns))
else:
sys.exit('FAIL: check individual df merges')
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#%% OR combining
afor_df = pd.read_csv(infile_afor, sep = ',')
afor_df.columns = afor_df.columns.str.lower()
if afor_df['mutation'].shape[0] == afor_df['mutation'].nunique():
print('No duplicate muts detected in afor_df')
else:
print('Dropping duplicate muts detected in afor_df')
afor_df = afor_df.drop_duplicates(subset = 'mutation', keep = 'first')
snpinfo_df_all = pd.read_csv(infile_snpinfo, sep = ',')
snpinfo_df = snpinfo_df_all[['mutation', 'mutationinformation']]
if snpinfo_df['mutation'].shape[0] == snpinfo_df['mutation'].nunique():
print('No duplicate muts detected in snpinfo_df')
else:
dups = snpinfo_df['mutation'].duplicated().sum()
print( dups, 'Duplicate muts detected in snpinfo_df'
, '\nDim:', snpinfo_df.shape)
print('Dropping duplicate muts')
snpinfo_df = snpinfo_df.drop_duplicates(subset = 'mutation', keep = 'first')
print('Dim:', snpinfo_df.shape)
print('==================================='
, '\nFifth merge: afor_df + snpinfo_df'
, '\n===================================')
merging_cols_m5 = detect_common_cols(afor_df, snpinfo_df)
afor_snpinfo_dfs = pd.merge(afor_df, snpinfo_df, on = merging_cols_m5, how = 'left')
if len(afor_snpinfo_dfs) == afor_df.shape[0]:
print('PASS: successfully combined with left join'
, '\nDim of df1:', afor_df.shape
, '\nDim of df2:', snpinfo_df.shape)
else:
sys.exit('FAIL: unsuccessful merge')
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',')
afor_kin_df.columns = afor_kin_df.columns.str.lower()
print('==================================='
, '\nSixth merge: afor_snpinfo_dfs + afor_kin_df'
, '\n===================================')
merging_cols_m6 = detect_common_cols(afor_snpinfo_dfs, afor_kin_df)
print('Dim of df1:', afor_snpinfo_dfs.shape
, '\nDim of df2:', afor_kin_df.shape
, '\nno. of merging_cols:', len(merging_cols_m6))
ors_df = pd.merge(afor_snpinfo_dfs, afor_kin_df, on = merging_cols_m6, how = 'outer')
print('Dim of ors_df:', ors_df.shape)
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print('==================================='
, '\nSeventh merge: combined_df + ors_df'
, '\n===================================')
merging_cols_m7 = detect_common_cols(combined_df, ors_df)
print('Dim of df1:', combined_df.shape
, '\nDim of df2:', ors_df.shape
, '\nno. of merging_cols:', len(merging_cols_m7))
print('checking mutations in the two dfs:'
, '\nmuts in df1 but NOT in df2:'
, combined_df['mutationinformation'].isin(ors_df['mutationinformation']).sum()
, '\nmuts in df2 but NOT in df1:'
, ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum())
#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df['mutationinformation']) )
#combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = 'outer') # FIXME
combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = 'left')
outdf_expected_rows = len(combined_df)
outdf_expected_cols = len(combined_df.columns) + len(ors_df.columns) - len(merging_cols_m7)
print('\nDim of combined_df_all:', combined_df_all.shape
, '\nwith join type: ????')
if combined_df_all.shape[1] == outdf_expected_cols:
print('combined_df has expected no. of cols')
if combined_df_all.shape[0] == outdf_expected_rows:
print('combined_df has expected no. of rows')
else:
print('WARNING: nrows discrepancy noted'
, '\nFIX IT')
print ('thing finished')
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# write csv
combined_df_all.to_csv(outfile_comb, index = False)
#=======================================================================
#%% in case you FIX the function: combine_dfs_with_checks
#def main():
# print('Reading input files:')
#mcsm_df = pd.read_csv(infile_mcsm, sep = ',')
#mcsm_df.columns = mcsm_df.columns.str.lower()
#foldx_df = pd.read_csv(infile_foldx , sep = ',')
#dssp_df = pd.read_csv(infile_dssp, sep = ',')
#dssp_df.columns = dssp_df.columns.str.lower()
#kd_df = pd.read_csv(infile_kd, sep = ',')
#kd_df.columns = kd_df.columns.str.lower()
#rd_df = pd.read_csv(infile_kd, sep = ',')
#if __name__ == '__main__':
# main()
#=======================================================================
#%% end of script


@@ -1,9 +0,0 @@
#!/usr/bin/env Rscript
print('R Argument Test')
cmd <- paste(commandArgs(), collapse=" ")
cat("How R was invoked:\n");
cat(cmd, "\n")
args <- commandArgs(trailingOnly = TRUE)
cat(c('Command Line Arguments supplied: ', args))


@@ -21,6 +21,9 @@ Created on Tue Aug 6 12:56:03 2019
# where each row is a separate mutation
# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique
+# NOTE
+#drtype is renamed to 'resistance' in the 35k dataset
# output files: all lower case
# 0) <gene>_common_ids.csv
# 1) <gene>_ambiguous_muts.csv
@@ -60,6 +63,7 @@ os.getcwd()
# import aa dict
from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
+from tidy_split import tidy_split
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
@@ -96,8 +100,8 @@ datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
-in_filename = 'original_tanushree_data_v2.csv'
-#in_filename = 'mtb_gwas_v3.csv'
+#in_filename = 'original_tanushree_data_v2.csv' #19k
+in_filename = 'mtb_gwas_meta_v3.csv' #33k
infile = datadir + '/' + in_filename
print('Input file: ', infile
, '\n============================================================')
@@ -121,17 +125,45 @@ master_data = pd.read_csv(infile, sep = ',')
#list(master_data.columns)
# extract elevant columns to extract from meta data related to the drug
-meta_data = master_data[['id'
-,'country'
-,'lineage'
-,'sublineage'
-,'drtype'
-, drug
-, dr_muts_col
-, other_muts_col
-]]
-del(master_data)
+#meta_data_ch = master_data[['id'
+#, 'country'
+#, 'lineage'
+#, 'sublineage'
+##, 'drtype' #19k only
+#, 'resistance'
+#, drug
+#, dr_muts_col
+#, other_muts_col]]
+core_cols = ['id'
+, 'country'
+, 'country2'
+, 'geographic_source'
+, 'region'
+, 'date'
+, 'strain'
+, 'lineage'
+, 'sublineage' #drtype renamed to resistance
+, 'resistance'
+, 'location'
+, 'host_body_site'
+, 'environment_material'
+, 'host_status'
+, 'hiv_status'
+, 'HIV_status'
+, 'isolation_source']
+variable_based_cols = [drug
+, dr_muts_col
+, other_muts_col]
+cols_to_extract = core_cols + variable_based_cols
+meta_data = master_data[cols_to_extract]
+del(master_data, variable_based_cols, cols_to_extract)
# checks and results
total_samples = meta_data['id'].nunique()
@@ -269,14 +301,23 @@ print('gene to extract:', gene_match )
#===============
# FIXME: replace drug with variable containing the drug name
# !!! important !!!
-meta_data_dr = meta_data[['id'
-,'country'
-,'lineage'
-,'sublineage'
-,'drtype'
-, drug
-, dr_muts_col
-]]
+#meta_data_dr = meta_data[['id'
+# ,'country'
+# ,'lineage'
+# ,'sublineage'
+# ,'drtype'
+# , drug
+# , dr_muts_col
+# ]]
+dr_based_cols = [drug, dr_muts_col]
+cols_to_extract = core_cols + dr_based_cols
+meta_data_dr = meta_data[cols_to_extract]
+del(dr_based_cols, cols_to_extract)
print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
print('actual dim:', meta_data_dr.shape
, '\n===============================================================')
@@ -306,14 +347,22 @@ dr_id = pd.Series(dr_id)
print('Extracting dr_muts from:', other_muts_col,'with other meta_data')
# FIXME: replace drug with variable containing the drug name
# !!! important !!!
-meta_data_other = meta_data[['id'
-,'country'
-,'lineage'
-,'sublineage'
-,'drtype'
-, drug
-, other_muts_col
-]]
+#meta_data_other = meta_data[['id'
+# ,'country'
+# ,'lineage'
+# ,'sublineage'
+## ,'drtype'
+# , drug
+# , other_muts_col
+# ]]
+dr_based_cols = [drug, other_muts_col]
+cols_to_extract = core_cols + dr_based_cols
+meta_data_other = meta_data[cols_to_extract]
+del(dr_based_cols, cols_to_extract)
print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
print('actual dim:', meta_data_other.shape
@@ -373,7 +422,7 @@ print('Writing file:'
, '\nExpected no. of rows:', len(common_ids)
, '\n=============================================================')
-common_ids.to_csv(outfile0)
+common_ids.to_csv(outfile0, index = False)
del(out_filename0)
# clear variables
@@ -419,44 +468,15 @@ print('This is still dirty data: samples have ', gene_match, 'muts but may have
#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
print('Performing tidy_split(): to separate the mutations into indivdual rows')
-# define the split function
-def tidy_split(df, column, sep='|', keep=False):
-'''
-Split the values of a column and expand so the new DataFrame has one split
-value per row. Filters rows where the column is missing.
-Params
-------
-df : pandas.DataFrame
-dataframe with the column to split and expand
-column : str
-the column to split and expand
-sep : str
-the string used to split the column's values
-keep : bool
-whether to retain the presplit value as it's own row
-Returns
--------
-pandas.DataFrame
-Returns a dataframe with the same columns as `df`.
-'''
-indexes = list()
-new_values = list()
-#df = df.dropna(subset=[column])#!!!!-----see this incase you need to uncomment based on use case
-for i, presplit in enumerate(df[column].astype(str)):
-values = presplit.split(sep)
-if keep and len(values) > 1:
-indexes.append(i)
-new_values.append(presplit)
-for value in values:
-indexes.append(i)
-new_values.append(value)
-new_df = df.iloc[indexes, :].copy()
-new_df[column] = new_values
-return new_df
-#%% end of tidy_split()
+#TIDY SPLIT HERE
#=========
# DF1: dr_muts_col
#=========
@@ -761,12 +781,11 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m
out_filename1 = gene.lower() + '_ambiguous_muts.csv'
outfile1 = outdir + '/' + out_filename1
print('Writing file: ambiguous muts'
-, '\nFilename:', out_filename1
-, '\nPath:', outdir)
+, '\nFilename:', outfile1)
#common_muts = ['gene_matchVal180Phe','gene_matchGln10Pro'] # test
inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
-inspect.to_csv(outfile1)
+inspect.to_csv(outfile1, index = False)
print('Finished writing:', out_filename1
, '\nNo. of rows:', len(inspect)
@@ -1069,13 +1088,13 @@ else:
print('FAIL: SNP has NA, Possible mapping issues from dict?'
, '\nDebug please!'
, '\n=========================================================')
+sys.exit()
out_filename2 = gene.lower() + '_mcsm_snps.csv'
outfile2 = outdir + '/' + out_filename2
print('Writing file: mCSM style muts'
-, '\nFilename:', out_filename2
-, '\nPath:', outdir
+, '\nFilename:', outfile2
, '\nmutation format (SNP): {WT}<POS>{MUT}'
, '\nNo. of distinct muts:', len(snps_only)
, '\nNo. of distinct positions:', len(pos_only)
@@ -1083,7 +1102,7 @@ print('Writing file: mCSM style muts'
snps_only.to_csv(outfile2, header = False, index = False)
-print('Finished writing:', out_filename2
+print('Finished writing:', outfile2
, '\nNo. of rows:', len(snps_only)
, '\nNo. of cols:', len(snps_only.columns)
, '\n=============================================================')
@@ -1099,7 +1118,7 @@ print('Writing file: LF formatted data'
, '\n============================================================')
gene_LF1.to_csv(outfile3, header = True, index = False)
-print('Finished writing:', out_filename3
+print('Finished writing:', outfile3
, '\nNo. of rows:', len(gene_LF1)
, '\nNo. of cols:', len(gene_LF1.columns)
, '\n=============================================================')
@@ -1118,11 +1137,11 @@ all_muts_msa.columns.dtype
all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')
# create an extra column with protein name
-all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
-all_muts_msa_sorted.head()
+#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
+#all_muts_msa_sorted.head()
# rearrange columns so the fasta name is the first column (required for mutate.script)
-all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
+#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
all_muts_msa_sorted.head()
print('Checking NA in snps...')# should be 0
@@ -1138,15 +1157,14 @@ out_filename4 = gene.lower() +'_all_muts_msa.csv'
outfile4 = outdir + '/' + out_filename4
print('Writing file: mCSM style muts for msa',
-'\nFilename:', out_filename4,
-'\nPath:', outdir,
+'\nFilename:', outfile4,
'\nmutation format (SNP): {WT}<POS>{MUT}',
'\nNo.of lines of msa:', len(all_muts_msa),
)
all_muts_msa_sorted.to_csv(outfile4, header = False, index = False)
-print('Finished writing:', out_filename4
+print('Finished writing:', outfile4
, '\nNo. of rows:', len(all_muts_msa)
, '\nNo. of cols:', len(all_muts_msa.columns)
, '\n=============================================================')
@@ -1177,7 +1195,7 @@ print('Writing file: mutational positions'
pos_only_sorted.to_csv(outfile5, header = True, index = False)
-print('Finished writing:', out_filename5
+print('Finished writing:', outfile5
, '\nNo. of rows:', len(pos_only_sorted)
, '\nNo. of cols:', len(pos_only_sorted.columns)
, '\n=============================================================')


@@ -1,218 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 7 09:30:16 2020
@author: tanu
"""
#=======================================================================
# TASK:
#=======================================================================
#%% load packages
import sys, os
import argparse
import re
import pandas as pd
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
import dms_tools2
import dms_tools2.dssp
import pprint as pp
#=======================================================================
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#drug = 'isoniazid'
#gene = 'katG'
#drug = 'cycloserine'
#gene = 'alr'
drug = args.drug
gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
#indir = datadir + '/' + drug + '/' + 'output'
indir = datadir + '/' + drug + '/' + 'input'
in_filename = gene.lower() + '_complex' + '.pdb'
#in_filename = 'katg_complex.pdb' # fixme for pnca(consistent filenames i.e pnca_complex.pdb)
infile = indir + '/' + in_filename
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
print('Output path:', outdir)
#out_filename = os.path.splitext(in_filename)[0]+'.dssp' # strip file ext
dssp_filename = gene.lower() + '.dssp'
dssp_file = outdir + '/' + dssp_filename
print('Output dssp:', dssp_file)
dsspcsv_filename = gene.lower() + '_dssp.csv'
dsspcsv_file = outdir + '/' + dsspcsv_filename
print('Outfile dssp to csv: ', dsspcsv_file
, '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% create .dssp from pdb
def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"):
"""
Create a DSSP file from a PDB file
@param inputpdbfile: pdb file
@type inputpdbfile: string
@param outfile: dssp file
@type outfile: string
@param DSSP: DSSP executable (argument to os.system)
@type DSSP: string
@return: none, creates dssp file
"""
# out_file = infile +'.dssp'
# outfile = os.path.splitext(inputpdbfile)[0]+'.dssp' # strip file ext
os.system("%s -i %s -o %s" % (DSSP, inputpdbfile, outfile))
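# The os.system() call above is equivalent to running, e.g.:
#   dssp -i pnca_complex.pdb -o pnca.dssp
# (assumes a DSSP executable that accepts -i/-o, e.g. the classic 'dssp' binary, is on PATH)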
#=======================================================================
#%% extract chain id from dssp
#print(dssp.keys())
#print(dssp.keys()[0][0])
#print(len(dssp))
#print(dssp.keys()[0][0])
#print(dssp.keys()[len(dssp)-1][0])
def extract_chain_dssp(inputpdbfile):
"""
extracts chain_ids from dssp run on pdb file
This is to allow processing of dssp output to df
and for writing as csv file
Parameters
----------
@param inputpdbfile: pdb file
@type inputpdbfile: string
Returns
-------
@return: chain_ids from running dssp on pdb file
@type list
"""
p = PDBParser()
structure = p.get_structure(in_filename, infile)
model = structure[0]
dssp = DSSP(model, infile)
dssp_chains = []
for num_aa in range(0, len(dssp)):
# print(num_aa)
# extract the chain id only and append to a list
dssp_chains.append(dssp.keys()[num_aa][0])
chainsL = list(set(dssp_chains))
print(chainsL)
# sort the list (since sets are not ordered) for convenience
# this will be required for dssp_df
pdbchainlist = sorted(chainsL)
print('dssp output for'
, in_filename, 'contains:', len(pdbchainlist)
, 'chains:\n', pdbchainlist)
return pdbchainlist
#=======================================================================
#%% write csv of processed dssp output
def dssp_to_csv(inputdsspfile, outfile, pdbchainlist = ['A']):
"""
Create a df from a dssp file containing ASA, RSA, SS for all chains
@param infile: dssp file
@type infile: string
@param outfile: csv file
@type outfile: string
@param DSSP: DSSP to df processing using dmstools
@type DSSP: string
@return: none, creates csv file
"""
dssp_df = pd.DataFrame()
print('Total no. of chains: ', len(pdbchainlist))
for chain_id in pdbchainlist:
print('Chain id:', chain_id)
dssp_cur = pd.DataFrame()
dssp_cur = dms_tools2.dssp.processDSSP(inputdsspfile, chain = chain_id)
#!!!Important!!!
dssp_cur['chain_id'] = chain_id
dssp_df = dssp_df.append(dssp_cur)
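# NOTE: DataFrame.append() is deprecated (removed in pandas 2.x); with newer pandas
# use dssp_df = pd.concat([dssp_df, dssp_cur]) instead.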
pp.pprint(dssp_df)
# Rename column 'amino_acid' to 'wild_type_dssp' and 'site' to 'position'
# to match the names used in the file required for merging later.
dssp_df.columns
dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True)
dssp_df.columns
# sanity check
# if len(dssp_df) == len(dssp):
# print('PASS: length of dssp_df has correct length')
# else:
# print('FAIL: length mismatch for dssp_df'
# , '\nexpected length:', len(dssp)
# , '\nGot length:', len(dssp_df)
# , 'Debug please!')
# write to csv
dssp_df.to_csv(outfile, header=True, index = False)
print('Finished writing:', outfile
, '\nNo. of rows:', len(dssp_df)
, '\nNo. of cols:', len(dssp_df.columns)
, '\n==============================================================')
#=======================================================================
#%% call functions
#dssp_file_from_pdb(infile, dssp_file, DSSP = "dssp")
#my_chains = extract_chain_dssp(infile)
#dssp_to_csv(dssp_file, dsspcsv_file, my_chains)
#%%
#=======================================================================
def main():
print('Running dssp with the following params:\n'
, in_filename
, 'outfile:', dsspcsv_filename)
dssp_file_from_pdb(infile, dssp_file, DSSP = "dssp")
my_chains = extract_chain_dssp(infile)
dssp_to_csv(dssp_file, dsspcsv_file, my_chains)
if __name__ == '__main__':
main()
#%% end of script
#=======================================================================
#=======================================================================


@@ -1,99 +0,0 @@
#!/usr/bin/env python3
import pandas as pd
DEBUG = False
#%%
#def find_missense(test_df, ref_allele1, alt_allele0):
def find_missense(test_df, ref_allele_column, alt_allele_column, n_diff_colname = 'n_diff', tot_diff_colname = 'tot_diff', ref_a_colname = 'ref_allele', alt_a_colname = 'alt_allele'):
"""Find mismatches in pairwise comparison of strings b/w col_a and col_b
Case insensitive, converts strings to uppercase before comparison
@test_df: df containing columns to compare
@type: pandas df
@ref_allele_column: column containing ref allele str
@type: str (converts to uppercase)
@alt_allele_column: column containing alt_allele str
@type: str (converts to uppercase)
@n_diff_colname: user defined colname for no. of char diff b/w ref_allele_str and alt_allele_str
@type: str
@tot_diff_colname: user defined colname for total diff (char mismatches + abs length difference), used to flag strings of unequal length
@type: str
@ref_a_colname: user defined colname containing extracted reference allele
@type: str
@alt_a_colname: user defined colname containing extracted alt allele
@type: str
returns df: with 4 columns. If column names clash, the function column
name will override original column
@rtype: pandas df
"""
for ind, val in test_df.iterrows():
if DEBUG:
print('index:', ind, 'value:', val
, '\n============================================================')
ref_a = val[ref_allele_column].upper()
alt_a = val[alt_allele_column].upper()
if DEBUG:
print('ref_allele_string:', ref_a, 'alt_allele_string:', alt_a)
difference = sum(1 for e in zip(ref_a, alt_a) if e[0] != e[1])
test_df.at[ind, n_diff_colname] = difference # adding column
tot_difference = difference + abs(len(ref_a) - len(alt_a))
test_df.at[ind, tot_diff_colname] = tot_difference # adding column
if difference != tot_difference:
print('WARNING: lengths of ref_allele and alt_allele differ at index:', ind
, '\nNon-missense muts detected')
# Now finding the mismatched char
ref_aln = ''
alt_aln = ''
if ref_a == alt_a:
##test_df.at[ind, 'ref_allele'] = 'no_change' # adding column
##test_df.at[ind, 'alt_allele'] = 'no_change' # adding column
test_df.at[ind, ref_a_colname] = 'no_change' # adding column
test_df.at[ind, alt_a_colname] = 'no_change' # adding column
elif len(ref_a) == len(alt_a) and len(ref_a) > 0:
print('ref:', ref_a, 'alt:', alt_a)
for n in range(len(ref_a)):
if ref_a[n] != alt_a[n]:
ref_aln += ref_a[n]
alt_aln += alt_a[n]
##test_df.at[ind, 'ref_allele'] = ref_aln
##test_df.at[ind, 'alt_allele'] = alt_aln
test_df.at[ind, ref_a_colname] = ref_aln
test_df.at[ind, alt_a_colname] = alt_aln
print('ref:', ref_aln)
print('alt:', alt_aln)
else:
##test_df.at[ind, 'ref_allele'] = 'ERROR_Not_nsSNP'
##test_df.at[ind, 'alt_allele'] = 'ERROR_Not_nsSNP'
test_df.at[ind, ref_a_colname] = 'ERROR_Not_nsSNP'
test_df.at[ind, alt_a_colname] = 'ERROR_Not_nsSNP'
return test_df
#========================================
# a representative example
eg_df = {'chromosome_number': [2288719, 2288766, 2288775, 2288779, 2288827, 1111111, 2222222],
'ref_allele1': ['Tc', 'AG', 'AGCACCCTG', 'CCCTGGTGGCC', 'CACA', 'AA', 'CAT'],
'alt_allele0': ['CC', 'CA', 'GGCACCCTGZ','TCCTGGTGGCCAAD', 'TACA', 'AA', 'TCZ']}
# snippet of actual data
#eg_df = pd.read_csv('pnca_assoc.txt', sep = '\t', nrows = 10, header = 0, index_col = False)
eg_df = pd.DataFrame(eg_df)
def main():
#find_missense(eg_df, ref_allele1 = 'ref_allele', alt_allele0 = 'alt_allele')
find_missense(test_df = eg_df, ref_allele_column = 'ref_allele1', alt_allele_column = 'alt_allele0')
print(eg_df)
if __name__ == '__main__':
main()


@@ -1,14 +0,0 @@
#!/usr/bin/env Rscript
require('getopt', quietly=TRUE)
spec = matrix(c(
"drug" , "d", 1, "character",
"gene" , "g", 1, "character"
), byrow=TRUE, ncol=4)
opt = getopt(spec);
drug = opt$drug
gene = opt$gene
cat(c('\nDrug:', drug, '\nGene:', gene, '\n'))


@@ -1,230 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
#=======================================================================
# Task: Hydrophobicity (Kd) values for amino acid sequence using the
# Kyte-Doolittle.
# Same output as using the expasy server (link below)
# Input: fasta file
# Output: csv file with
# useful links
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
# https://web.expasy.org/protscale/pscale/protscale_help.html
#=======================================================================
#%% load packages
import sys, os
import argparse
import pandas as pd
import numpy as np
from pylab import *
from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
#from Bio.Alphabet.IUPAC import IUPACProtein
import pprint as pp
#=======================================================================
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
arg_parser.add_argument('-g', '--gene', help='gene name', default = None)
#arg_parser.add_argument('-p', '--plot', help='show plot', action='store_true')
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = args.drug
gene = args.gene
#plot = args.plot
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
indir = datadir + '/' + drug + '/' + 'input'
in_filename = '3pl1.fasta.txt'
infile = indir + '/' + in_filename
print('Input filename:', in_filename
, '\nInput path:', indir
, '\n============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_kd.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% kd values from fasta file and output csv
def kd_to_csv(inputfasta, outputkdcsv, windowsize = 3):
"""
Calculate kd (hydropathy values) from input fasta file
@param inputfasta: fasta file
@type inputfasta: string
@param outputkdcsv: csv file with kd values
@type outfile: string
@param windowsize: window size to perform KD calcs on (Kyte-Doolittle)
@type windowsize: numeric
@return: none, writes kd values df as csv
"""
#========================
# read input fasta file
#========================
fh = open(inputfasta)
for record in SeqIO.parse(fh, 'fasta'):
id = record.id
seq = record.seq
num_residues = len(seq)
fh.close()
sequence = str(seq)
X = ProteinAnalysis(sequence)
#===================
# calculate KD values: same as the expasy server
#===================
my_window = windowsize
offset = round((my_window/2)-0.5)
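# e.g. windowsize = 3 gives offset = 1, i.e. the profile loses one residue at each end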
# edge weight is set to default (100%)
kd_values = (X.protein_scale(ProtParamData.kd , window = my_window))
# sanity checks
print('Sequence Length:', num_residues)
print('kd_values Length:',len(kd_values))
print('Window Length:', my_window)
print('Window Offset:', offset)
print('=================================================================')
print('Checking:len(kd values) is as expected for the given window size & offset...')
expected_length = num_residues - (my_window - offset)
if len(kd_values) == expected_length:
print('PASS: expected and actual length of kd values match')
else:
print('FAIL: length mismatch'
,'\nExpected length:', expected_length
,'\nActual length:', len(kd_values)
, '\n=========================================================')
#===================
# creating two dfs
#===================
# 1) aa sequence and 2) kd_values. Then reset index for each df
# which will allow easy merging of the two dfs.
# df1: df of aa seq with index reset to start from 1
# (reflective of the actual aa position in a sequence)
# Name column of wt as 'wild_type' to be the same name used
# in the file required for merging later.
dfSeq = pd.DataFrame({'wild_type_kd':list(sequence)})
dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive
# df2: df of kd_values with index reset to start from offset + 1 and
# subsequent matched length of the kd_values
dfVals = pd.DataFrame({'kd_values':kd_values})
dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
# sanity checks
max(dfVals['kd_values'])
min(dfVals['kd_values'])
#===================
# concatenating dfs
#===================
# Merge the two on index
# (as these are now reflective of the aa position numbers): df1 and df2
# This will introduce NaN where there is missing values. In our case this
# will be 2 (first and last ones based on window size and offset)
kd_df = pd.concat([dfSeq, dfVals], axis = 1)
#============================
# renaming index to position
#============================
kd_df = kd_df.rename_axis('position')
kd_df.head
print('Checking: position col i.e. index should be numeric')
if kd_df.index.dtype == 'int64':
print('PASS: position col is numeric'
, '\ndtype is:', kd_df.index.dtype)
else:
print('FAIL: position col is not numeric'
, '\nConverting to numeric')
kd_df.index = kd_df.index.astype('int64')
print('Checking dtype for after conversion:\n'
, '\ndtype is:', kd_df.index.dtype
, '\n=========================================================')
#===============
# writing file
#===============
print('Writing file:'
, '\nFilename:', outputkdcsv
# , '\nPath:', outdir
, '\nExpected no. of rows:', len(kd_df)
, '\nExpected no. of cols:', len(kd_df.columns)
, '\n=============================================================')
kd_df.to_csv(outputkdcsv, header = True, index = True)
#===============
# plot: optional!
#===============
# http://www.dalkescientific.com/writings/NBN/plotting.html
# FIXME: save fig
# extract just the pdb id from 'id' to pass to the title of the plot
# foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
# if doplot:
plot(kd_values, linewidth = 1.0)
#axis(xmin = 1, xmax = num_residues)
xlabel('Residue Number')
ylabel('Hydrophobicity')
title('K&D Hydrophobicity for ' + id)
show()
#%% end of function
#=======================================================================
#%% call function
#kd_to_csv(infile, outfile, windowsize = 3)
#=======================================================================
def main():
print('Running hydropathy calcs with the following params\n'
, in_filename
, '\noutfile:', out_filename)
kd_to_csv(infile, outfile, 3)
print('Finished writing file:'
, '\nFilename:', outfile
, '\n=============================================================')
if __name__ == '__main__':
main()
#%% end of script
#=======================================================================
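#%% standalone sketch (illustrative only, not part of the pipeline): the same
# Biopython call used in kd_to_csv() above, assuming a single-record fasta file
# named 'example.fasta' exists in the working directory:
# from Bio import SeqIO
# from Bio.SeqUtils.ProtParam import ProteinAnalysis
# from Bio.SeqUtils import ProtParamData
# rec = next(SeqIO.parse('example.fasta', 'fasta'))
# kd = ProteinAnalysis(str(rec.seq)).protein_scale(ProtParamData.kd, window = 3)
# print(len(rec.seq), len(kd)) # len(kd) == len(rec.seq) - window + 1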

View file

@ -1,152 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 10 11:13:49 2020
@author: tanu
"""
#==============================================================================
# TASK
# To format the snp_info.txt file that has already been processed in bash
# to include mcsm-style muts and gwas-style muts. The idea is that the info file
# will contain all possible mutation format styles to make it easy to populate
# and link other files with this sort of meta data. For example: the OR file
#=======================================================================
# FIXME : add bash info here as well
#%% useful links
#https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/
#https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/
#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
#=======================================================================
#%% specify dirs
import os, sys
import pandas as pd
import numpy as np
import re
import argparse
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#from reference_dict import my_aa_dict
#from reference_dict import low_3letter_dict # equivalent of my_aa_dict
from reference_dict import oneletter_aa_dict
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = 'pncA') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variables
#gene = 'pncA'
#drug = 'pyrazinamide'
#gene_match = gene +'_p.'
# cmd variables
gene = args.gene
drug = args.drug
gene_match = gene +'_p.'
#=======================================================================
#%% input and output dirs and files
#=======
# data dir
#=======
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'
#=======
# input
#=======
gene_info_filename = 'ns'+ gene.lower()+ '_snp_info1.txt'
gene_info = indir + '/' + gene_info_filename
print('gene info file: ', gene_info
, '\n============================================================')
#=======
# output
#=======
gene_snp_info_filename = 'ns' + gene.lower() + '_snp_info.csv' # other one is called AFandOR
outfile_snp_info = indir + '/' + gene_snp_info_filename
print('Output file: ', outfile_snp_info
, '\n============================================================')
#%% read files: preformatted using bash
info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10
#%% Split into three cols with 1-letter aa_code & combine to get mutationinformation column
# check mutation format in existing df
info_df2.head()
info_df2['mut_info'].head()
print('Creating column: mutationinformation')
info_df2_ncols = len(info_df2.columns)
info_df2['wild_type'] = info_df2['mut_info'].str.extract('(\w{1})>')
info_df2['position'] = info_df2['mut_info'].str.extract('(\d+)')
info_df2['mutant_type'] = info_df2['mut_info'].str.extract('>\d+(\w{1})')
info_df2['mutationinformation'] = info_df2['wild_type'] + info_df2['position'] + info_df2['mutant_type']
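# Illustrative example (hypothetical value): a mut_info string like 'K>96T'
# would give wild_type = 'K', position = '96', mutant_type = 'T', and hence
# mutationinformation = 'K96T' (mcsm style)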
# sanity check
ncols_add = 4 # Beware hardcoded
if len(info_df2.columns) == info_df2_ncols + ncols_add:
print('PASS: Successfully extracted and added mutationinformation (mcsm style) as below\n'
, info_df2['mutationinformation'].head()
, '\n=====================================================================')
else:
print('FAIL: No. of cols mismatch'
,'\noriginal length:', info_df2_ncols
, '\nExpected no. of cols:', info_df2_ncols + ncols_add
, '\nGot no. of cols:', len(info_df2.columns))
sys.exit()
# update ncols
info_df2_ncols = len(info_df2.columns)
#%% Creating column 'mutation' which has mutations of the format:
# <gene_match>.lower()<abc>1<cde>: pnca_p.trp68gly
# match the 'one_letter_code' value to get the dict key (three-letter code)
print('Creating column: mutation')
# dict to use: oneletter_aa_dict
lookup_dict = dict()
for k1, v1 in oneletter_aa_dict.items():
lookup_dict[k1] = v1['three_letter_code_lower']
# map each 1-letter aa code to its lowercase 3-letter equivalent (no loop needed)
info_df2['wt_3let'] = info_df2['wild_type'].map(lookup_dict)
info_df2['mt_3let'] = info_df2['mutant_type'].map(lookup_dict)
# create column mutation
info_df2['mutation'] = info_df2['wt_3let'] + info_df2['position'] + info_df2['mt_3let']
# add prefix: gene_match to each value in column
info_df2['mutation'] = gene_match.lower() + info_df2['mutation'].astype(str)
# sanity check
ncols_add = 3 # Beware hardcoded
if len(info_df2.columns) == info_df2_ncols + ncols_add:
print('PASS: Successfully created column mutation as below\n'
, info_df2['mutation'].head()
, '\n=====================================================================')
else:
print('FAIL: No. of cols mismatch\noriginal length:', info_df2_ncols
, '\nExpected no. of cols:', info_df2_ncols + ncols_add
, '\nGot no. of cols:', len(info_df2.columns))
sys.exit()
#%% write file
print('\n====================================================================='
, '\nWriting output file:\n', outfile_snp_info
, '\nNo.of rows:', len(info_df2)
, '\nNo. of cols:', len(info_df2.columns))
info_df2.to_csv(outfile_snp_info, index = False)

View file

@ -1,334 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 10 11:13:49 2020
@author: tanu
"""
#=======================================================================
#%% useful links
#https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/
#https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/
#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
#=======================================================================
#%% specify dirs
import os, sys
import pandas as pd
import numpy as np
import re
import argparse
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
# local import
from find_missense import find_missense
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = 'pncA') # case sensitive
arg_parser.add_argument('-s', '--start_coord', help = 'start of coding region (cds) of gene', default = 2288681) # pnca cds
arg_parser.add_argument('-e', '--end_coord', help = 'end of coding region (cds) of gene', default = 2289241) # pnca cds
args = arg_parser.parse_args()
#=======================================================================
#%% variables
#gene = 'pncA'
#drug = 'pyrazinamide'
#start_cds = 2288681
#end_cds = 2289241
# cmd variables
gene = args.gene
drug = args.drug
start_cds = args.start_coord
end_cds = args.end_coord
#=======================================================================
#%% input and output dirs and files
#=======
# data dir
#=======
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'
#=======
# input
#=======
info_filename = 'snp_info.txt'
snp_info = datadir + '/' + info_filename
print('Info file: ', snp_info
, '\n============================================================')
gene_info_filename = 'ns'+ gene.lower()+ '_snp_info.txt'
gene_info = indir + '/' + gene_info_filename
print('gene info file: ', gene_info
, '\n============================================================')
in_filename_or = 'ns'+ gene.lower()+ '_assoc.txt'
gene_or = indir + '/' + in_filename_or
print('gene OR file: ', gene_or
, '\n============================================================')
#=======
# output
#=======
gene_or_filename = gene.lower() + '_af_or_kinship.csv' # other one is called AFandOR
outfile_or_kin = outdir + '/' + gene_or_filename
print('Output file: ', outfile_or_kin
, '\n============================================================')
#%% read files: preformatted using bash
# or file: '...assoc.txt'
or_df = pd.read_csv(gene_or, sep = '\t', header = 0, index_col = False) # 182, 12 (without filtering for missense muts it was 212, i.e. 30 muts weren't missense)
or_df.head()
or_df.columns
#%% snp_info file: master and gene specific ones
# gene info
#info_df2 = pd.read_csv('nssnp_info_pnca.txt', sep = '\t', header = 0) #303, 10
info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10
mis_mut_cover = (info_df2['chromosome_number'].nunique()/info_df2['chromosome_number'].count()) * 100
print('*****RESULT*****'
, '\nPercentage of MISsense mut in pncA:', mis_mut_cover
, '\n*****RESULT*****') #65.7%
# large file
#info_df = pd.read_csv('snp_info.txt', sep = '\t', header = None) #12010
info_df = pd.read_csv(snp_info, sep = '\t') #12010
#info_df.columns = ['chromosome_number', 'ref_allele', 'alt_allele', 'snp_info'] #12009, 4
info_df['chromosome_number'].nunique() #10257
mut_cover = (info_df['chromosome_number'].nunique()/info_df['chromosome_number'].count()) * 100
print('*****RESULT*****'
,'\nPercentage of mutations in pncA:', mut_cover
, '\n*****RESULT*****') #85.4%
# extract unique chr position numbers
genomic_pos = info_df['chromosome_number'].unique()
genomic_pos_df = pd.DataFrame(genomic_pos, columns = ['chr_pos'])
genomic_pos_df.dtypes
genomic_pos_min = info_df['chromosome_number'].min()
genomic_pos_max = info_df['chromosome_number'].max()
# genomic coord for pnca coding region
cds_len = (end_cds-start_cds) + 1
pred_prot_len = (cds_len/3) - 1
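# Worked example with the default pncA coords: (2289241 - 2288681) + 1 = 561 bp,
# so pred_prot_len = 561/3 - 1 = 186 aa (the - 1 presumably drops the stop codon)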
# mindblowing: difference b/w bitwise (&) and 'and'
# DO NOT want &: is this bit set to '1' in both variables? Is this what you want?
#if (genomic_pos_min <= start_cds) & (genomic_pos_max >= end_cds):
print('*****RESULT*****'
, '\nlength of coding region:', cds_len, 'bp'
, '\npredicted protein length:', pred_prot_len, 'aa'
, '\n*****RESULT*****')
if genomic_pos_min <= start_cds and genomic_pos_max >= end_cds:
print ('PASS: coding region for gene included in snp_info.txt')
else:
print('FAIL: coding region for gene not included in info file snp_info.txt')
sys.exit('ERROR: coding region of gene not included in the info file')
#%% Extracting ref allele and alt allele as single letters
# info_df has some of these params as more than a single letter, which means that
# when you try to merge ONLY using chromosome_number, then it messes up... and is WRONG.
# Hence the merge needs to be performed on a unique set of attributes which in our case
# would be chromosome_number, ref_allele and alt_allele
#FIXME: Turn to a function
orig_len = len(or_df.columns)
#find_missense(or_df, 'ref_allele1', 'alt_allele0')
find_missense(or_df, ref_allele_column = 'ref_allele1', alt_allele_column = 'alt_allele0')
ncols_add = 4
if len(or_df.columns) == orig_len + ncols_add:
print('PASS: Successfully extracted ref and alt alleles for missense muts')
else:
print('FAIL: No. of cols mismatch'
,'\noriginal length:', orig_len
, '\nExpected no. of cols:', orig_len + ncols_add
, '\nGot no. of cols:', len(or_df.columns))
sys.exit()
del(orig_len, ncols_add)
#%% TRY MERGE
# check dtypes
or_df.dtypes
info_df.dtypes
#or_df.info()
# pandas documentation where it mentions: "Pandas uses the object dtype for storing strings"
# check how many unique chr_num in info_df are in or_df
genomic_pos_df['chr_pos'].isin(or_df['chromosome_number']).sum() #144
# check how many chr_num in or_df are in info_df: should be ALL of them
or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() #182
# sanity check 2
if or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() == len(or_df):
print('PASS: all genomic locs in or_df have meta data in snp_info.txt')
else:
sys.exit('FAIL: some genomic locs or_df chr number DO NOT have meta data in snp_info.txt')
#%% Perform merge
#my_join = 'inner'
#my_join = 'outer'
my_join = 'left'
#my_join = 'right'
#dfm1 = pd.merge(or_df, info_df, on ='chromosome_number', how = my_join, indicator = True) # not unique!
dfm1 = pd.merge(or_df, info_df, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True)
dfm1['_merge'].value_counts()
# count no. of missense mutations ONLY
dfm1.snp_info.str.count(r'(missense.*)').sum()
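# (each row whose snp_info mentions 'missense' contributes exactly 1 to this
# sum, since the trailing .* swallows the rest of the string)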
dfm2 = pd.merge(or_df, info_df2, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True)
dfm2['_merge'].value_counts()
# count no. of nan
dfm2['mut_type'].isna().sum()
# drop nan
dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
#%% sanity check
# count no. of missense muts
if len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum() == dfm2['mut_type'].isna().sum():
print('PASSED: numbers cross checked'
, '\nTotal no. of missense mutations:', dfm1.snp_info.str.count(r'(missense.*)').sum()
, '\nNo. of mutations falsely assumed to be missense:', len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum())
# two ways to filter to get only missense muts
test = dfm1[dfm1['snp_info'].str.count('missense.*')>0]
dfm1_mis = dfm1[dfm1['snp_info'].str.match('(missense.*)') == True]
test.equals(dfm1_mis)
# drop nan
dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
if dfm1_mis[['chromosome_number', 'ref_allele', 'alt_allele']].equals(dfm2_mis[['chromosome_number', 'ref_allele', 'alt_allele']]):
print('PASS: Further cross checks successful')
else:
print('FAIL: Second cross check unsuccessful. Debug please!')
sys.exit()
#%% extract mut info into three cols
orig_len = len(dfm2_mis.columns)
dfm2_mis['wild_type'] = dfm2_mis['mut_info'].str.extract('(\w{1})>')
dfm2_mis['position'] = dfm2_mis['mut_info'].str.extract('(\d+)')
dfm2_mis['mutant_type'] = dfm2_mis['mut_info'].str.extract('>\d+(\w{1})')
dfm2_mis['mutationinformation'] = dfm2_mis['wild_type'] + dfm2_mis['position'] + dfm2_mis['mutant_type']
# sanity check
ncols_add = 4
if len(dfm2_mis.columns) == orig_len + ncols_add:
print('PASS: Successfully extracted and added mutationinformation (mcsm style)')
else:
print('FAIL: No. of cols mismatch'
,'\noriginal length:', orig_len
, '\nExpected no. of cols:', orig_len + ncols_add
, '\nGot no. of cols:', len(dfm2_mis.columns))
sys.exit()
#%% formatting data for output
print('no of cols preformatting data:', len(dfm2_mis.columns))
#1) Add column: OR for kinship calculated from beta coeff
print('converting beta coeff to OR by exponentiating:\n'
, dfm2_mis['beta'].head())
dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta'])
print(dfm2_mis['or_kin'].head())
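# Quick sanity check on the maths (illustrative): beta = 0 gives or_kin = exp(0) = 1
# (no effect), and beta ~ 0.693 gives or_kin ~ 2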
#2) rename af column
dfm2_mis.rename(columns = {'af': 'af_kin'
, 'beta': 'beta_kin'
, 'p_wald': 'pwald_kin'
, 'se': 'se_kin', 'logl_H1': 'logl_H1_kin'
, 'l_remle': 'l_remle_kin'}, inplace = True)
#3) drop some not required cols (including duplicate if you want)
#3a) drop duplicate columns
dfm2_mis2 = dfm2_mis.T.drop_duplicates().T #changes dtypes in cols, so not used
dup_cols = set(dfm2_mis.columns).difference(dfm2_mis2.columns)
print('Duplicate columns identified:', dup_cols)
dup_cols = {'alt_allele0', 'ps'} # didn't want to remove tot_diff
print('removing duplicate columns: kept one of the dup_cols i.e tot_diff')
dfm2_mis.drop(list(dup_cols), axis = 1, inplace = True)
print(dfm2_mis.columns)
#3b) other not useful columns
dfm2_mis.drop(['chromosome_text', 'chr', 'symbol', '_merge', ], axis = 1, inplace = True)
dfm2_mis.rename(columns = {'ref_allele1': 'reference_allele'}, inplace = True)
print(dfm2_mis.columns)
#4) reorder columns
orkin_linked = dfm2_mis[['mutationinformation',
'wild_type',
'position',
'mutant_type',
'chr_num_allele',
'ref_allele',
'alt_allele',
'mut_info',
'mut_type',
'gene_id',
'gene_number',
'mut_region',
'reference_allele',
'alternate_allele',
'chromosome_number',
#'afs
'af_kin',
'or_kin',
# 'ors_logistic',
# 'ors_chi_cus',
# 'ors_fisher',
'pwald_kin',
# 'pvals_logistic',
# 'pvals_fisher',
# 'ci_lb_fisher',
# 'ci_ub_fisher' ,
'beta_kin',
'se_kin',
'logl_H1_kin',
'l_remle_kin',
# 'stat_chi',
# 'pvals_chi',
'n_diff',
'tot_diff',
'n_miss']]
# sanity check after reassigning columns
if orkin_linked.shape == dfm2_mis.shape and set(orkin_linked.columns) == set(dfm2_mis.columns):
print('PASS: Successfully formatted df with rearranged columns')
else:
sys.exit('FAIL: something went wrong when rearranging columns!')
#%% write file
print('\n====================================================================='
, '\nWriting output file:\n', outfile_or_kin
, '\nNo.of rows:', len(dfm2_mis)
, '\nNo. of cols:', len(dfm2_mis.columns))
orkin_linked.to_csv(outfile_or_kin, index = False)
#%% diff b/w allele0 and 1: or_df
#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
#df = or_df.iloc[[5, 15, 17, 19, 34]]
#df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int)
#df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int)

View file

@ -1,34 +0,0 @@
#!/usr/bin/env python3
from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from align import myalign
import re
import os
os.chdir('/home/tanu/git/LSHTM_analysis/scripts/examples')
def main():
"""
align ref_seq and pdb_seq
# FIXME: pass command line args i.e filename
"""
my_dict = {}
align_fastas_to_align = open('align_fastas.txt', 'r')
for record in SeqIO.parse(align_fastas_to_align,"fasta"):
myid = record.id
seq = str(record.seq)
my_dict.update({myid : seq})
my_keys = list(my_dict.keys())
my_ref_seq = my_dict[my_keys[0]]
my_pdb_seq = my_dict[my_keys[1]]
fasta_alignment = myalign(my_ref_seq, my_pdb_seq)
print(fasta_alignment)
print('class:', type(fasta_alignment))
if __name__ == '__main__':
main()

View file

@ -1,70 +0,0 @@
#!/usr/bin/env python3
# Copyright 2020, Tanushree Tunstall
# This program is distributed under General Public License v. 3. See the file
# COPYING for a copy of the license.
__description__ = \
"""
chain_extract.py
extract chain/s from pdb and saves each chain as a separate file
"""
__author__ = "Tanushree Tunstall"
__date__ = ""
#=======================================================================
import os, shutil, sys
#from pdbtools.helper import cmdline
from chain_extract import ChainExtract
from Bio.PDB import PDBParser, PDBIO, Select
#from Bio.PDB.PDBParser import PDBParser
#io = PDBIO()
import argparse
#=======================================================================
def main():
"""
Function to call if run from command line.
Example use:
pdb_chain_extract.py -i <your_pdb_file> -c <chainid1><chainid2> -p <outpath> -o <outfile>
Extracts chain 'A' by default.
"""
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-i', '--pdb_file', help='provide pdb file', default = 'None')
arg_parser.add_argument('-c', '--chain', help='chain/s to extract without spaces.', nargs = '+', default = 'A', type = list)
arg_parser.add_argument('-p', '--out_path', help='specify output path', default = '.', type = str)
arg_parser.add_argument('-o', '--out_file', help='specify output filename. Will be used as a prefix to append chain id and pdb file extension', default = 'pdbfile', type = str)
args = arg_parser.parse_args()
# Extract chains and write each chain as a separate file
pdb_file = args.pdb_file
print('input pdb file:', pdb_file)
# type = list, makes it a list of lists. Hence extracting the list of chains.
chains = args.chain[0]
#chains = ['A','B','C']
print ('user supplied chain:', chains)
# output filename and path
outpath = args.out_path
outfile = args.out_file
# get structure
p = PDBParser(PERMISSIVE=1)
structure = p.get_structure(pdb_file, pdb_file)
print('input pdb filename:', structure.get_id())
my_chains = chains
#my_chains = ['G', 'H']
c_names = ''.join(my_chains)
print('Extracting chains:', my_chains)
pdb_chains_file = outpath + '/' + outfile + '_' + c_names + '.pdb'
io = PDBIO()
io.set_structure(structure)
io.save(pdb_chains_file, ChainExtract(my_chains))
if __name__ == "__main__":
main()

View file

@ -1,71 +0,0 @@
#!/usr/bin/env python3
# Copyright 2020, Tanushree Tunstall
# This program is distributed under General Public License v. 3. See the file
# COPYING for a copy of the license.
__description__ = \
"""
chain_splitter.py
extract chain/s from pdb and saves each chain as a separate file
"""
__author__ = "Tanushree Tunstall"
__date__ = ""
#=======================================================================
import os, shutil, sys
#from pdbtools.helper import cmdline
from chain_splitter import ChainSelect
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser
#io = PDBIO()
import argparse
#=======================================================================
def main():
"""
Function to call if run from command line.
Example use:
pdb_chain_splitter.py -i <your_pdb_file> -c <chainid1><chainid2>
Extracts chain 'A' by default.
FIXME: extract all chains from the given pdb and write them out individually
"""
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-i', '--pdb_file', help='provide pdb file', default = 'None')
arg_parser.add_argument('-c', '--chain', help='chain/s to extract without spaces.', nargs = '+', default = 'A', type = list)
arg_parser.add_argument('-p', '--out_path', help='specify output path', default = '.', type = str)
arg_parser.add_argument('-o', '--out_file', help='specify output filename. Will be used as a prefix to append chain id and pdb file extension', default = 'pdb_file_chain', type = str)
args = arg_parser.parse_args()
# Extract chains and write each chain as a separate file
pdb_file = args.pdb_file
print('input pdb file:', pdb_file)
# type = list, makes it a list of lists. Hence extracting the list of chains.
chains = args.chain[0]
#chains = ['A','B','C']
print ('user supplied chain:', chains)
# output filename and path
outpath = args.out_path
outfile = args.out_file
# get structure
p = PDBParser(PERMISSIVE=1)
structure = p.get_structure(pdb_file, pdb_file)
print('input pdb filename:', structure.get_id())
for chain in chains:
chain = chain.upper()
print ('Extracting chain:', chain)
pdb_chain_file = outpath + '/' + outfile + '_{}.pdb'.format(chain)
io = PDBIO()
io.set_structure(structure)
io.save('{}'.format(pdb_chain_file), ChainSelect(chain))
if __name__ == "__main__":
main()

@ -1 +0,0 @@
Subproject commit 881ff8f27aaf1db4266a84fb03baad3dab552c64

View file

@ -1,70 +0,0 @@
#======================================================
# renumber pdb file based on user defined start number
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_residue_renumber /home/tanu/git/Data/cycloserine/input/alr_complex_model.pdb -s 35 -r
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_residue_renumber /home/tanu/git/Data/rifampicin/input/rpob_complex_model.pdb -s 29 -r
#======================================================
# pdb_seq.py: extract seq from structure
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/Data/ethambutol/input/3byw.pdb > 3byw_seq.txt
#/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -c A -a /home/tanu/git/Data/ethambutol/input/3byw.pdb > 3byw_seq.txt
#======================================================
# gidB
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/LSHTM_3TB/gid/docking/3g89.pdb > 3g89_seq.txt
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/LSHTM_3TB/gid/docking/gidb_chopin1.pdb > gidb_chopin1_seq.txt
alignment
>3g89A_ATOM chain_length:238
MFGKHPGGLSERGRALLLEGGKALGLDLKPHLEAFSRLYALLQEAGEEEVVVKHFLDSLTLLRLPLWQGPLRVLDLGTGA
GFPGLPLKIVRPELELVLVDATRKKVAFVERAIEVLGLKGARALWGRAEVLAREAGHREAYARAVARAVAPLCVLSELLL
PFLEVGGAAVAMKGPRVEEELAPLPPALERLGGRLGEVLALQLPLSGEARHLVVLEKTAPTPPAYPRRPGVPERHPLC
>gidb_chopin1 _ATOM chain_length:224
MSPIEPAASAIFGPRLGLARRYAEALAGPGVERGLVGPREVGRLWDRHLLNCAVIGELLERGDRVVDIGSGAGLPGVPLA
IARPDLQVVLLEPLLRRTESLREMVTDLGVAVEIVRGRAEESWVQDQLGGSDAAVSRAVAALDKLTKWSMPLIRPNGRML
AIKGERAHDEVREHRRVMIASGAVDVRVVTCGANYLRPPATVVFARRGKQIARGSARMASGGTA
#======================================================
# pdb_mutator.py: mutate residue: FIXME, needs charm
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_mutator -r 39 -m XXX /home/tanu/git/Data/ethambutol/input/3byw.pdb
#======================================================
# pdb_ligand.py: list ligands within pdb
# note: works ONLY for pdb containing ligands
# this is because such pdbs contain a field 'HETATM '
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand /home/tanu/git/Data/ethambutol/input/7bvf_b.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand /home/tanu/git/Data/ethambutol/input/7bvf.pdb
#======================================================
# pdb_hetatm.py: list ligands for valid pdbs AND docked complexes (my use case)
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/cycloserine/input/alr_complex.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/pyrazinamide/input/pnca_complex.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/ethambutol/input/7bvf_b.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/ethambutol/input/7bvf.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/rifampicin/input/rpob_complex.pdb
#======================================================
# get torsion angles
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_torsion /home/tanu/git/Data/rifampicin/input/rpob_complex.pdb > /home/tanu/git/Data/rifampicin/input/rpob_torsion.txt
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# my pdb tools
#======================================================
# save specifed chains as individual pdbs
#======================================================
./pdb_chain_splitter.py -i /home/tanu/git/Data/ethambutol/input/3byw.pdb -c DF -p /home/tanu/git/Data/ethambutol/input -o 3byw
#======================================================
# save specifed chains as one pdb
#======================================================
./pdb_chain_extract.py -i /home/tanu/git/Data/ethambutol/input/3byw.pdb -c DF -p /home/tanu/git/Data/ethambutol/input -o 3byw

View file

@ -1,12 +0,0 @@
[extractor]
data_dir = /home/tanu/git/Data
#master_file = original_tanushree_data_v2.csv
master_file = mtb_gwas_v3.csv
# Relative Paths. Per-drug paths will be created like:
#
# /home/tanu/git/Data/<drug name>/input
# /home/tanu/git/Data/<drug name>/output
input_dir = input
output_dir = output

View file

@ -1,172 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
#=============================================================================
# Task: Residue depth (rd) processing to generate a df with residue_depth(rd)
# values
# FIXME
# Input: '.tsv' i.e residue depth txt file (output from .zip file manually
# downloaded from the website).
# This should be integrated into the pipeline
# Output: .csv with 3 cols i.e position, rd_values & 3-letter wt aa code(caps)
#=============================================================================
#%% load packages
import sys, os
import argparse
import pandas as pd
#=============================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
arg_parser.add_argument('-g', '--gene', help='gene name', default = None) # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = args.drug
gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
outdir = datadir + '/' + drug + '/' + 'output'
in_filename = '3pl1_rd.tsv'
infile = outdir + '/' + in_filename
print('Input filename:', in_filename
, '\nInput path:', outdir
, '\n=============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_rd.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% rd values from <gene>_rd.tsv values
def rd_to_csv(inputtsv, outputrdcsv):
"""
Extract residue depth (rd) values from the downloaded tsv file
@param inputtsv: tsv file downloaded from {INSERT LINK}
@type inputtsv: string
@param outputrdcsv: csv file with rd values
@type outfile: string
@return: none, writes rd values df as csv
"""
#========================
# read downloaded tsv file
#========================
#%% Read input file
rd_data = pd.read_csv(inputtsv, sep = '\t')
print('Reading input file:', inputtsv
, '\nNo. of rows:', len(rd_data)
, '\nNo. of cols:', len(rd_data.columns))
print('Column names:', rd_data.columns
, '\n===============================================================')
#========================
# creating position col
#========================
# Extracting residue number from index and assigning
# the values to a column [position]. Then convert the position col to numeric.
rd_data['position'] = rd_data.index.str.extract('([0-9]+)').values
# converting position to numeric
rd_data['position'] = pd.to_numeric(rd_data['position'])
rd_data['position'].dtype
print('Extracted residue num from index and assigned as a column:'
, '\ncolumn name: position'
, '\ntotal no. of cols now:', len(rd_data.columns)
, '\n=============================================================')
#========================
# Renaming amino-acid
# and all-atom cols
#========================
print('Renaming columns:'
, '\ncolname==> # chain:residue: wt_3letter_caps'
, '\nYES... the column name *actually* contains a # ..!'
, '\ncolname==> all-atom: rd_values'
, '\n=============================================================')
rd_data.rename(columns = {'# chain:residue':'wt_3letter_caps', 'all-atom':'rd_values'}, inplace = True)
print('Column names:', rd_data.columns)
#========================
# extracting df with the
# desired columns
#========================
print('Extracting relevant columns for writing df as csv')
rd_df = rd_data[['position','rd_values','wt_3letter_caps']]
if len(rd_df) == len(rd_data):
print('PASS: extracted df has expected no. of rows'
,'\nExtracted df dim:'
,'\nNo. of rows:', len(rd_df)
,'\nNo. of cols:', len(rd_df.columns))
else:
print('FAIL: no. of rows mismatch'
, '\nExpected no. of rows:', len(rd_data)
, '\nGot no. of rows:', len(rd_df)
, '\n=====================================================')
#===============
# writing file
#===============
print('Writing file:'
, '\nFilename:', outputrdcsv
# , '\nPath:', outdir
# , '\nExpected no. of rows:', len(rd_df)
# , '\nExpected no. of cols:', len(rd_df.columns)
, '\n=========================================================')
rd_df.to_csv(outputrdcsv, header = True, index = False)
#%% end of function
#=======================================================================
#%% call function
#rd_to_csv(infile, outfile)
#=======================================================================
def main():
print('Running residue depth processing with the following params\n'
, in_filename
, '\noutfile:', out_filename)
rd_to_csv(infile, outfile)
print('Finished Writing file:'
, '\nFilename:', outfile
, '\n=============================================================')
if __name__ == '__main__':
main()
#%% end of script
#=======================================================================

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,15 +0,0 @@
>Mycobacterium tuberculosis H37Rv|Rv3423c|alr
VKRFWENVGKPNDTTDGRGTTSLAMTPISQTPGLLAEAMVDLGAIEHNVRVLREHAGHAQLMAVVKADGYGH
GATRVAQTALGAGAAELGVATVDEALALRADGITAPVLAWLHPPGIDFGPALLADVQVAVSSLRQLDELLHA
VRRTGRTATVTVKVDTGLNRNGVGPAQFPAMLTALRQAMAEDAVRLRGLMSHMVYADKPDDSINDVQAQRFT
AFLAQAREQGVRFEVAHLSNSSATMARPDLTFDLVRPGIAVYGLSPVPALGDMGLVPAMTVKCAVALVKSIR
AGEGVSYGHTWIAPRDTNLALLPIGYADGVFRSLGGRLEVLINGRRCPGVGRICMDQFMVDLGPGPLDVAEG
DEAILFGPGIRGEPTAQDWADLVGTIHYEVVTSPRGRITRTYREAENR
>alr_complex | chain A | 371 aa
LAEAMVDLGAIEHNVRVLREHAGHAQLMAVVKADGYGHGATRVAQTALGAGAAELGVATVDEALALRADGIT
APVLAWLHPPGIDFGPALLADVQVAVSSLRQLDELLHAVRRTGRTATVTVKVDTGLNRNGVGPAQFPAMLTA
LRQAMAEDAVRLRGLMSHMVYADKPDDSINDVQAQRFTAFLAQAREQGVRFEVAHLSNSSATMARPDLTFDL
VRPGIAVYGLSPVPALGDMGLVPAMTVKCAVALVKSIRAGEGVSYGHTWIAPRDTNLALLPIGYADGVFRSL
GGRLEVLINGRRCPGVGRICMDQFMVDLGPGPLDVAEGDEAILFGPGIRGEPTAQDWADLVGTIHYEVVTSP
RGRITRTYREA

View file

@ -1,125 +0,0 @@
#!/usr/bin/env python3
# useful links
#https://towardsdatascience.com/pairwise-sequence-alignment-using-biopython-d1a9d0ba861f
#https://www.biostars.org/p/265338/
from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
import re
import os
#%%
os.chdir('/home/tanu/git/LSHTM_analysis/scripts/examples')
#%%
def myalign(ref_seq, pdb_seq):
myalign_dict = {}
alignments = pairwise2.align.globalxx(ref_seq, pdb_seq)
#alignments = pairwise2.align.localxx(ref, struct)
match = []
for a, b in zip(alignments[0][0], alignments[0][1]):
if a == b:
match.append('|')
else:
match.append(' ')
#print(match)
print(alignments[0][0])
print("".join(match))
print(alignments[0][1])
result_align = alignments[0][1]
#print(result_align)
print('===============================================================\n')
# update dict
myalign_dict.update({'aligned_fasta': result_align})
aa_regex = '\w'
m = re.search(aa_regex, result_align)
#m = my_match.span()
offset = m.start()
offset_end = m.end()
print('start of match:', offset
, '\nend of match:', offset_end)
print('===============================================================\n')
# update dict
myalign_dict.update({'start_match' : offset})
myalign_dict.update({'end_match' : offset_end})
return myalign_dict
def main():
"""
read file containing reference and pdb_sequence to align
"""
my_dict = {}
align_fastas_to_align = open('align_fastas.txt', 'r')
for record in SeqIO.parse(align_fastas_to_align,"fasta"):
myid = record.id
seq = str(record.seq)
my_dict.update({myid : seq})
my_keys = list(my_dict.keys())
my_ref_seq = my_dict[my_keys[0]] # ref_seq
my_pdb_seq = my_dict[my_keys[1]] # pdb_seq
fasta_alignment = myalign(my_ref_seq, my_pdb_seq)
print('this is my result:', fasta_alignment)
if __name__ == '__main__':
main()
#%% debug: individually
my_dict = {}
align_fastas_to_align = open('align_fastas.txt', 'r')
for record in SeqIO.parse(align_fastas_to_align,"fasta"):
myid = record.id
seq = str(record.seq)
#print(myid, seq)
my_dict.update({myid: seq})
print(my_dict)
print(my_dict.keys())
my_keys = list(my_dict.keys())
alignments = pairwise2.align.globalxx(my_dict[my_keys[0]], my_dict[my_keys[1]])
match = []
for a, b in zip(alignments[0][0], alignments[0][1]):
if a == b:
match.append('|')
else:
match.append(' ')
#print(match)
print(alignments[0][0])
print("".join(match))
print(alignments[0][1])
result_align = alignments[0][1]
#print(result_align)
print('===============================================================\n')
#offset = ''
aa_regex = '\w'
m = re.search(aa_regex, result_align)
#m = my_match.span()
offset = m.start()
offset_end = m.end()
print('start of match:', offset, '\nend of match:', offset_end)
print('===============================================================\n')

File diff suppressed because it is too large Load diff

View file

@ -1,52 +0,0 @@
#!/usr/bin/python3
#=======================================================================
# TASK: select specified chains from the structure & save a cropped structure with
# the selected chains. Useful for dimer, etc modelling.
# link for saving each chain as a separate file
# https://stackoverflow.com/questions/11685716/how-to-extract-chains-from-a-structure-file
#=======================================================================
#%%
import os, sys
from Bio.PDB import PDBParser, PDBIO, Select
#%% homdir and curr dir and local imports
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/Data/ethambutol/input/')
os.getcwd()
#%%
# Select() Method to return True for every chain in 'chains'
class ChainExtract(Select):
def __init__(self, chain):
self.chain = chain
def accept_chain(self, chain):
#print dir(chain)
if chain.id in self.chain:
return 1
else:
return 0
def main():
p = PDBParser(PERMISSIVE=1)
structure = p.get_structure("3byw", "3byw.pdb")
my_chains = ['G', 'H'] # specify selected chains
c_names = ''.join(my_chains)
pdb_chains_file = 'pdb_crop_' + c_names + '.pdb'
io = PDBIO()
io.set_structure(structure)
#io.save(structure.get_id() + "_crop.structure", ChainExtract(my_chains))
io.save(pdb_chains_file, ChainExtract(my_chains))
if __name__ == '__main__':
main()
#%%
# test
#my_chains = ['G', 'H'] # specify selected chains
#foo = ''.join(my_chains) # to append to file name
#pdb_chains_file = '_{}.pdb'.format(my_chains)

View file

@ -1,49 +0,0 @@
#!/usr/bin/python3
#=======================================================================
# TASK: extract chain from pdb and save each chain as a separate file
# link for saving each chain as a separate file
# https://stackoverflow.com/questions/11685716/how-to-extract-chains-from-a-pdb-file
# command line args
# https://stackoverflow.com/questions/15753701/how-can-i-pass-a-list-as-a-command-line-argument-with-argparse
#=======================================================================
#%%
import os, sys
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser
#%% homdir and curr dir and local imports
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
#%%
class ChainSelect(Select):
def __init__(self, chain):
self.chain = chain
def accept_chain(self, chain):
if chain.get_id() == self.chain:
return 1
else:
return 0
def main():
chains = ['A','B','C','F']
p = PDBParser(PERMISSIVE=1)
#structure = p.get_structure(pdb_file, pdb_file)
structure = p.get_structure('/home/tanu/git/Data/ethambutol/input/3byw', '/home/tanu/git/Data/ethambutol/input/3byw.pdb')
#print('STRUCTURE:', structure.get_id())
# pdb_filename = print()
for chain in chains:
pdb_chain_file = 'pdb_file_chain_{}.pdb'.format(chain)
io = PDBIO()
io.set_structure(structure)
io.save('{}'.format(pdb_chain_file), ChainSelect(chain))
# If run from command line...
if __name__ == "__main__":
main()

View file

@ -1,45 +0,0 @@
#!/usr/bin/env python
import os
from biopandas.pdb import PandasPdb
#%%
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts/examples')
#%%
file_list = ['7bvf_b.pdb', 'pnca_complex.pdb', 'alr_complex.pdb']
file_list = ['7bvf_b.pdb']
#file_list = ['pnca_complex.pdb']
file_list = ['alr_complex.pdb']
BORING_LIGANDS = ["HOH","CA","SO4","IOD","NA","CL","GOL","PO4"]
#%% df with list
ligands_dict = {}
for pdb_id in file_list:
ppdb = PandasPdb()
pdb_file = ppdb.read_pdb(pdb_id)
het = pdb_file.df['HETATM']
het_list = list(set(het['residue_name']))
ligands = [ l for l in het_list if l not in BORING_LIGANDS]
lig_dict = {pdb_id:ligands}
#lig_dict = {pdb_id:het_list} # include BORING_LIGANDS
ligands_dict.update(lig_dict)
print(ligands_dict)
print('pdb_code:', pdb_file.code) # works only in case of valid pdb
print('pdb_code:', pdb_file.pdb_path) #works for bespoke pdb but prints the full path
print('pdb_code:', os.path.basename(pdb_file.pdb_path)) # prints only the last part i.e filename
#%% test with one
ppdb = PandasPdb()
pdb_file = ppdb.read_pdb('7bvf_b.pdb')
het = pdb_file.df['HETATM']
het_list = list(set(het['residue_name']))
print(het_list)
ligands = [ l for l in het_list if l not in BORING_LIGANDS]
print(ligands)
#%% extract last part from file path
print(os.path.basename(homedir + '/git/LSHTM_analysis/scripts/examples'))
print(os.path.basename('alr_complex.pdb'))
foo = os.path.basename(pdb_file.pdb_path)
print(foo)

View file

@ -1,81 +0,0 @@
#!/usr/bin/env python
import os
from Bio.PDB import *
from biopandas.pdb import PandasPdb
from collections import defaultdict, OrderedDict
import pandas as pd
from functools import reduce
#%% see verison of pandas
#print(pd.__version__)
#%%
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts/examples')
# link
#https://www.pythonprogramming.in/pandas-count-distinct-values-of-one-column-depend-on-another-column.html
#https://datascience.stackexchange.com/questions/32328/export-pandas-to-dictionary-by-combining-multiple-row-values
# 3 way merge
#https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#https://stackoverflow.com/questions/52223045/merge-multiple-dataframes-based-on-a-common-column
#%% Read data
file_list = ['7bvf_b.pdb']
file_list = ['3byw.pdb']
#file_list = ['7bvf_b.pdb', 'pnca_complex.pdb', '3byw']
#%%
for pdb in file_list:
print(pdb)
p = PDBParser()
structure = p.get_structure(pdb, pdb)
for model in structure:
for chain in model:
for residue in chain:
for atom in residue:
pass # print(atom) # a loop body is required; uncomment the print to inspect atoms
#%% biopandas
pdb_dict = {}
for pdb_id in file_list:
ppdb = PandasPdb()
pdb_file = ppdb.read_pdb(pdb_id)
#dir(pdb_file)
atm_df = pdb_file.df['ATOM']
#print('column names:', atm_df.columns)
pdb_chains = list(set(atm_df['chain_id']))
print('pdb chains:', pdb_chains)
total_chains = len(pdb_chains)
print('total no. of chains:', total_chains)
chain_info = {}
#atm_df_s = atm_df.sort_values(by=['atom_number', 'chain_id', 'residue_number'])
c_start = atm_df.groupby('chain_id').residue_number.min()
print(c_start)
c_start_df = pd.DataFrame({'chain_id': c_start.index, 'start_res': c_start.values})
c_end = atm_df.groupby('chain_id').residue_number.max()
print(c_end)
c_end_df = pd.DataFrame({'chain_id': c_end.index, 'end_res': c_end.values})
c_length = atm_df.groupby('chain_id').residue_number.nunique()
print(c_length)
c_length_df = pd.DataFrame({'chain_id': c_length.index, 'chain_len': c_length.values})
# combine 3 series into and assign 'chain_id' as a column
# using reduce with a lambda function works well (it works with however many dataframes you want to merge)
# using pd.concat creates extra chain id cols
c_df = reduce(lambda left,right: pd.merge(left,right, on = 'chain_id'), [c_start_df, c_end_df, c_length_df])
# convert df to dict with 'chain_id' as key and columns as list of values
chain_dict = c_df.set_index('chain_id').T.to_dict('list')
print(chain_dict)
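# Illustrative output shape (chain ids/numbers are hypothetical):
# {'A': [1, 246, 246], 'B': [1, 246, 246]}
# i.e. chain_id: [start_res, end_res, chain_len]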
#%% Idea
#protein_name: total_chains: 8, total ligands/hetatm = 3
#df of chain details
#chain start_res end_res len_chain
#pdb tools script separate one chain
# remove water and

File diff suppressed because it is too large Load diff

60
scripts/tidy_split.py Normal file
View file

@ -0,0 +1,60 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
#=======================================================================
#%% load libraries
import os, sys
import pandas as pd
#import numpy as np
#=======================================================================
#%% homdir and curr dir and local imports
#homedir = os.path.expanduser('~')
# set working dir
#os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#os.getcwd()
#%%=====================================================================
# define the split function
def tidy_split(df, column, sep = '|', keep = False):
'''
Split the values of a column and expand so the new DataFrame has one split
value per row. Rows where the column is missing are kept as-is (the dropna call below is commented out).
Params
------
df : pandas.DataFrame
dataframe with the column to split and expand
column : str
the column to split and expand
sep : str
the string used to split the column's values
keep : bool
whether to retain the presplit value as its own row
Returns
-------
pandas.DataFrame
Returns a dataframe with the same columns as `df`.
'''
indexes = list()
new_values = list()
#df = df.dropna(subset=[column])#!!!!-----see this incase you need to uncomment based on use case
for i, presplit in enumerate(df[column].astype(str)):
values = presplit.split(sep)
if keep and len(values) > 1:
indexes.append(i)
new_values.append(presplit)
for value in values:
indexes.append(i)
new_values.append(value)
new_df = df.iloc[indexes, :].copy()
new_df[column] = new_values
return new_df
#%%=====================================================================
#end of tidy_split()
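#%%=====================================================================
# minimal usage sketch (illustrative only: the column name and values below
# are made up and not taken from any project file)
if __name__ == '__main__':
    example_df = pd.DataFrame({'id': [1, 2]
                               , 'mutations': ['S65W;L4S', 'T135P']})
    print(tidy_split(example_df, 'mutations', sep = ';'))
    # each ';'-separated mutation now occupies its own row, with 'id' repeated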