LSHTM_analysis/meta_data_analysis/kd_df.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug  6 12:56:03 2019

@author: tanu
'''
#=======================================================================
# Task: Compute hydrophobicity (Kd) values for an amino acid sequence using
# the Kyte & Doolittle scale.
# Same output as using the expasy server https://web.expasy.org/protscale/
# Input: fasta file

# Output: csv file with columns: position, wild_type_kd, kd_values


# useful links
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
#=======================================================================
#%% load packages
from pylab import *
from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
#from Bio.Alphabet.IUPAC import IUPACProtein
#import pprint as pp
import pandas as pd
import numpy as np
import sys, os
#=======================================================================
#%% specify input and curr dir
# Home directory of the current user; the project paths are built from it.
homedir = os.path.expanduser('~')

# Switch into the analysis directory of the git checkout. The getcwd()
# calls only display the directory before/after when run interactively.
os.getcwd()
os.chdir('/'.join((homedir, 'git', 'LSHTM_analysis', 'meta_data_analysis')))
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
# Drug / gene this run is configured for.
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = '{}_p.'.format(gene)

#==========
# data dir
#==========
# Root of the data tree: <home>/git/Data
datadir = '/'.join((homedir, 'git/Data'))

#=======
# input
#=======
# The fasta file is read from <datadir>/<drug>/input
indir = '/'.join((datadir, drug, 'input'))
in_filename = '3pl1.fasta.txt'
infile = '/'.join((indir, in_filename))
print('Input filename:', in_filename,
      '\nInput path:', indir,
      '\n============================================================')

#=======
# output
#=======
# The csv is written to <datadir>/<drug>/output/<gene in lower case>_kd.csv
outdir = '/'.join((datadir, drug, 'output'))
out_filename = '{}_kd.csv'.format(gene.lower())
outfile = '/'.join((outdir, out_filename))
print('Output filename:', out_filename,
      '\nOutput path:', outdir,
      '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#===================
#calculate KD values: same as the expasy server
#===================
#%%specify window size for hydropathy profile computation
# https://web.expasy.org/protscale/pscale/protscale_help.html
# Size of the sliding window used for the hydropathy profile.
my_window = 3
# Number of residues between the start of a window and its centre; used
# later to line the kd values up with sequence positions.
offset = round((my_window/2)-0.5)

# Read the fasta file. The context manager closes the handle even when
# parsing fails. If the file holds several records, the last one is used.
# NOTE: 'id' shadows the builtin; the name is kept because the plotting
# section at the end of the script reads it.
id = None
seq = None
with open(infile) as fh:
    for record in SeqIO.parse(fh, 'fasta'):
        id = record.id
        seq = record.seq

# Fail with a clear message instead of an unrelated NameError further down.
if seq is None:
    raise ValueError('No fasta records found in: ' + infile)

num_residues = len(seq)
sequence = str(seq)

X = ProteinAnalysis(sequence)

# Kyte & Doolittle scale; edge weight is left at its default (100%).
kd_values = X.protein_scale(ProtParamData.kd, window = my_window)

# sanity checks: report the sizes involved, then verify the profile length
print('Sequence Length:', num_residues)
print('kd_values Length:', len(kd_values))
print('Window Length:', my_window)
print('Window Offset:', offset)
print('=================================================================')
print('Checking:len(kd values) is as expected for the given window size & offset...')
# A sliding window of size w fits into n residues exactly n - w + 1 times,
# and protein_scale returns one value per window position. The previous
# formula, n - (w - offset), only agrees with this for a window of 3.
expected_length = num_residues - my_window + 1
if len(kd_values) == expected_length:
    print('PASS: expected and actual length of kd values match')
else:
    print('FAIL: length mismatch',
          '\nExpected length:', expected_length,
          '\nActual length:', len(kd_values),
          '\n=========================================================')
#===================
# creating two dfs
#===================
#%% make 2 dfs; 1) aa sequence and 2) kd_values. Then reset index for each df
# which will allow easy merging of the two dfs.

# df1: one row per residue of the sequence, in a column named
# 'wild_type_kd'. The index runs from 1 to len(sequence) so that it is the
# 1-based amino acid position (the arange stop value is exclusive).
dfSeq = pd.DataFrame({'wild_type_kd': list(sequence)},
                     index = np.arange(1, len(sequence) + 1))

# df2: the kd values. The index starts at offset + 1 and covers as many
# consecutive positions as there are kd values.
first_kd_position = offset + 1
dfVals = pd.DataFrame({'kd_values': kd_values},
                      index = np.arange(first_kd_position,
                                        first_kd_position + len(kd_values)))

# sanity checks: range of the kd values (only displayed when run interactively)
max(dfVals['kd_values'])
min(dfVals['kd_values'])

#===================
# concatenating dfs
#===================
# Join the residue table and the kd table side by side on their indices,
# which both hold amino acid positions.
# Positions that have no kd value get NaN. With the window size and offset
# used here these are the first and the last position (2 rows).
# For pnca: the last position is not part of the struc, so no info loss.
# Needless to say that this will be variable for other targets.

kd_df = pd.concat([dfSeq, dfVals], axis = 1)

#============================
# renaming index to position
#============================
kd_df = kd_df.rename_axis('position')
print('=================================================================')

# The closing quote of the first string was missing here, which made the
# whole script fail to parse.
print('position col i.e. index should be numeric',
      '\n===============================================================')

if kd_df.index.dtype == 'int64':
    print('PASS: position col is numeric',
          '\ndtype is:', kd_df.index.dtype)
else:
    print('FAIL: position col is not numeric',
          '\nConverting to numeric')
    # astype returns a new index; it has to be assigned back, otherwise the
    # conversion is silently discarded.
    kd_df.index = kd_df.index.astype('int64')
    print('Checking dtype for after conversion:\n',
          '\ndtype is:', kd_df.index.dtype,
          '\n=========================================================')
#===============
# writing file
#===============
# Announce the target file before writing it.
print('Writing file:', out_filename,
      '\nFilename:', out_filename,
      '\nPath:', outdir,
      '\n=============================================================')

# Write the table with a header row and the 'position' index as first column.
kd_df.to_csv(outfile, header = True, index = True)

# Report the dimensions of the table that was written.
n_rows = len(kd_df)
n_cols = len(kd_df.columns)
print('Finished writing:', out_filename,
      '\nNo. of rows:', n_rows,
      '\nNo. of cols:', n_cols,
      '\n=============================================================')

#===============
# plot: optional!
#===============
#%% plot
# http://www.dalkescientific.com/writings/NBN/plotting.html

# FIXME: save fig
# extract just pdb if from 'id' to pass to title of plot
# foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)

# Plot each kd value at the position of its window centre (offset + 1
# onwards), so that the x axis really is the residue number. Plotting
# kd_values alone would use 0-based list indices instead.
positions = np.arange(offset + 1, offset + 1 + len(kd_values))
plot(positions, kd_values, linewidth = 1.0)
#axis(xmin = 1, xmax = num_residues)
xlabel('Residue Number')
ylabel('Hydrophobicity')
title('K&D Hydrophobicity for ' + id)
show()
print('======================================================================')
#%% end of script
#=======================================================================