diff --git a/scripts/kd_df.py b/scripts/kd_df.py
new file mode 100755
index 0000000..904d92f
--- /dev/null
+++ b/scripts/kd_df.py
@@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug  6 12:56:03 2019
+
+@author: tanu
+'''
+#=======================================================================
+# Task: Hydrophobicity (Kd) values for amino acid sequence using the
+# Kyte-Doolittle scale.
+# Same output as using the expasy server (link below)
+# Input: fasta file
+    
+# Output: csv file with residue position, wild-type residue and kd value
+
+# useful links
+# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
+# https://web.expasy.org/protscale/pscale/protscale_help.html
+#=======================================================================
+#%% load packages
+import sys, os
+import argparse
+import pandas as pd
+import numpy as np
+from pylab import *
+from Bio.SeqUtils import ProtParamData
+from Bio.SeqUtils.ProtParam import ProteinAnalysis
+from Bio import SeqIO
+#from Bio.Alphabet.IUPAC import IUPACProtein
+import pprint as pp
+#=======================================================================
+#%% specify homedir and curr dir
+homedir = os.path.expanduser('~')
+
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.getcwd()
+#=======================================================================
+#%% command line args
+arg_parser = argparse.ArgumentParser()
+#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
+#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
+arg_parser.add_argument('-d', '--drug', help='drug name', default = 'DRUGNAME')
+arg_parser.add_argument('-g', '--gene', help='gene name', default = 'geneName')
+args = arg_parser.parse_args()
+#=======================================================================
+#%% variable assignment: input and output 
+#drug = 'pyrazinamide'
+#gene = 'pncA'
+drug = args.drug
+gene = args.gene
+gene_match = gene + '_p.'
+
+#==========
+# data dir
+#==========
+datadir = homedir + '/' + 'git/Data'
+
+#=======
+# input
+#=======
+indir = datadir + '/' + drug + '/' + 'input'
+in_filename = '3pl1.fasta.txt'
+infile = indir + '/' + in_filename
+print('Input filename:', in_filename
+      , '\nInput path:', indir
+      , '\n============================================================')
+
+#=======
+# output 
+#=======
+outdir =   datadir + '/' + drug + '/' + 'output'
+out_filename = gene.lower() + '_kd.csv'
+outfile =  outdir + '/' + out_filename
+print('Output filename:', out_filename
+      , '\nOutput path:', outdir
+      , '\n=============================================================')
+#%% end of variable assignment for input and output files
+#=======================================================================
+def kd_to_csv(inputfasta, outputkdcsv, windowsize):
+	"""
+    Calculate kd (hydropathy values) from input fasta file
+
+    @param inputfasta: fasta file
+    @type inputfasta: string
+
+    @param outputkdcsv: csv file with kd values
+    @type outfile: string
+
+    @param windowsize: windowsize to perform KD calcs on (Kyte&-Doolittle)
+    @type DSSP: numeric
+
+    @return:  writes df of kd values as csv
+    @type: .csv
+    """
+    #===================
+    #calculate KD values: same as the expasy server
+    #===================
+	my_window = windowsize
+	offset = round((my_window/2)-0.5)
+
+	fh = open(inputfasta)
+
+	for record in SeqIO.parse(fh, 'fasta'):
+		id = record.id
+		seq = record.seq
+		num_residues = len(seq)
+	fh.close()
+
+	sequence = str(seq)
+	X = ProteinAnalysis(sequence)
+
+	# edge weight is set to  default (100%)
+	kd_values = (X.protein_scale(ProtParamData.kd , window = my_window))
+	# sanity checks 
+	print('Sequence Length:', num_residues)
+	print('kd_values Length:',len(kd_values))
+	print('Window Length:', my_window)
+	print('Window Offset:', offset)
+	print('=================================================================')
+	print('Checking:len(kd values) is as expected for the given window size & offset...')
+	expected_length =  num_residues - (my_window - offset) 
+	if len(kd_values) == expected_length:
+		print('PASS: expected and actual length of kd values match')
+	else:
+		print('FAIL: length mismatch'
+		      ,'\nExpected length:', expected_length
+		      ,'\nActual length:', len(kd_values)
+		      , '\n=========================================================')
+ 
+ 	#===================
+	# creating two dfs
+	#===================
+	# 1) aa sequence and 2) kd_values. Then reset index for each df 
+	# which will allow easy merging of the two dfs.
+
+	# df1: df of aa seq with index reset to start from 1 
+	# (reflective of the actual aa position in a sequence)
+	# Name column of wt as 'wild_type' to be the same name used 
+	# in the file required for merging later.
+	dfSeq = pd.DataFrame({'wild_type_kd':list(sequence)})
+	dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive
+
+	# df2: df of kd_values with index reset to start from offset + 1 and 
+	# subsequent matched length of the kd_values
+	dfVals = pd.DataFrame({'kd_values':kd_values})
+	dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
+
+	# sanity checks
+	max(dfVals['kd_values'])
+	min(dfVals['kd_values'])
+	
+	#===================
+	# concatenating dfs
+	#===================
+	# Merge the two on index 
+	# (as these are now reflective of the aa position numbers): df1 and df2 
+	# This will introduce NaN where there is missing values. In our case this
+	# will be 2 (first and last ones based on window size and offset)
+
+	kd_df = pd.concat([dfSeq, dfVals], axis = 1)
+
+	#============================
+	# renaming index to position
+	#============================
+	kd_df = kd_df.rename_axis('position')
+	kd_df.head
+
+	print('Checking: position col i.e. index should be numeric')
+	if kd_df.index.dtype == 'int64':
+		print('PASS: position col is numeric'
+		      , '\ndtype is:', kd_df.index.dtype)
+	else:
+		print('FAIL: position col is not numeric'
+		      , '\nConverting to numeric')
+		kd_df.index.astype('int64')
+		print('Checking dtype for after conversion:\n'
+		      , '\ndtype is:', kd_df.index.dtype
+		      , '\n=========================================================')
+	#===============
+	# writing file
+	#===============
+	print('Writing file:', out_filename
+		  , '\nFilename:', out_filename
+		  , '\nPath:',  outdir
+		  , '\n=============================================================')
+
+	kd_df.to_csv(outfile, header = True, index = True)
+
+	print('Finished writing:', out_filename
+		  , '\nNo. of rows:', len(kd_df)
+		  , '\nNo. of cols:', len(kd_df.columns)
+		  , '\n=============================================================')
+		  
+	#===============
+	# plot: optional!
+	#===============
+	# http://www.dalkescientific.com/writings/NBN/plotting.html
+
+	# FIXME: save fig
+	# extract just pdb if from 'id' to pass to title of plot
+	# foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
+	plot(kd_values, linewidth = 1.0)
+	#axis(xmin = 1, xmax = num_residues)
+	xlabel('Residue Number')
+	ylabel('Hydrophobicity')
+	title('K&D Hydrophobicity for ' + id)
+	show()
+#%% end of function
+#=======================================================================
+#%% call function
+#kd_to_csv(infile, outfile, windowsize = 3)
+#=======================================================================
+def main():
+	print('Running hydropathy calcs', in_filename, 'output csv:', out_filename)
+	kd_to_csv(infile, outfile, windowsize = 3)
+	
+if __name__ == '__main__':
+	main()
+#%% end of script	
+#=======================================================================