updated kd.py to relfect a merging col for combining num params later

This commit is contained in:
Tanushree Tunstall 2020-03-25 15:20:54 +00:00
parent d44ab57f5a
commit 37e1d43b76
3 changed files with 137 additions and 82 deletions

View file

@ -10,16 +10,7 @@ Created on Tue Aug 6 12:56:03 2019
# concentrate on positions that have structural info? # concentrate on positions that have structural info?
# FIXME: import dirs.py to get the basic dir paths available # FIXME: import dirs.py to get the basic dir paths available
#=======================================================================
#%% load libraries
import os, sys
import pandas as pd
#import numpy as np
#from pandas.api.types import is_string_dtype
#from pandas.api.types import is_numeric_dtype
#========================================================
# TASK: extract ALL pncA_p. mutations from GWAS data # TASK: extract ALL pncA_p. mutations from GWAS data
# Input data file has the following format: each row = unique sample id # Input data file has the following format: each row = unique sample id
# id,country,lineage,sublineage,drtype,pyrazinamide,dr_mutations_pyrazinamide,other_mutations_pyrazinamide... # id,country,lineage,sublineage,drtype,pyrazinamide,dr_mutations_pyrazinamide,other_mutations_pyrazinamide...
@ -38,46 +29,58 @@ import pandas as pd
# 3) pnca_metadata.csv # 3) pnca_metadata.csv
# 4) pnca_all_muts_msa.csv # 4) pnca_all_muts_msa.csv
# 5) pnca_mutational_positons.csv # 5) pnca_mutational_positons.csv
#======================================================== #=======================================================================
#%% load libraries
import os, sys
import pandas as pd
#import numpy as np
#from pandas.api.types import is_string_dtype
#from pandas.api.types import is_numeric_dtype
#%% specify homedir as python doesn't recognise tilde #%% specify homedir as python doesn't recognise tilde
homedir = os.path.expanduser('~') homedir = os.path.expanduser('~')
# my working dir # set working dir
os.getcwd() os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd() os.getcwd()
# import aa dict # import aa dict
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE! from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
#======================================================== #=======================================================================
#%% variable assignment: input and output paths & filenames #%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide' drug = 'pyrazinamide'
gene = 'pncA' gene = 'pncA'
gene_match = gene + '_p.' gene_match = gene + '_p.'
#========== #=======
# input dir # data dir
#========== #=======
#indir = 'git/Data/pyrazinamide/input/original' #indir = 'git/Data/pyrazinamide/input/original'
indir = homedir + '/' + 'git/Data' datadir = homedir + '/' + 'git/Data'
#=========== #=======
# output dir # input
#=========== #=======
#indir = 'git/Data/pyrazinamide/input/original'
in_filename = 'original_tanushree_data_v2.csv'
infile = datadir + '/' + in_filename
print('Input filename: ', in_filename
, '\nInput path: ', indir)
#=======
# output
#=======
# several output files # several output files
# output filenames in respective sections at the time of outputting files # output filenames in respective sections at the time of outputting files
#outdir = 'git/Data/pyrazinamide/output' outdir = datadir + '/' + drug + '/' + 'output'
outdir = homedir + '/' + 'git/Data' + '/' + drug + '/' + 'output' print('Output filename: in the respective sections'
, '\nOutput path: ', outdir)
#%%end of variable assignment for input and output files #%%end of variable assignment for input and output files
#============================================================================== #=======================================================================
#%% Read files #%% Read input file
in_filename = 'original_tanushree_data_v2.csv'
infile = indir + '/' + in_filename
print('Reading input master file:', infile)
master_data = pd.read_csv(infile, sep = ',') master_data = pd.read_csv(infile, sep = ',')
# column names # column names

View file

@ -1,13 +1,19 @@
#!/usr/bin/python #!/usr/bin/env python3
#%% # -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
#=======================================================================
# Task: Hydrophobicity (Kd) values for amino acid sequence using the Kyt&-Doolittle # Task: Hydrophobicity (Kd) values for amino acid sequence using the Kyt&-Doolittle
# Same output as using the expasy server https://web.expasy.org/protscale/ # Same output as using the expasy server https://web.expasy.org/protscale/
# useful links # useful links
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html # https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html # https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
#%% #=======================================================================
# load packages #%% load packages
from pylab import * from pylab import *
from Bio.SeqUtils import ProtParamData from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis from Bio.SeqUtils.ProtParam import ProteinAnalysis
@ -17,25 +23,46 @@ from Bio import SeqIO
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import sys, os import sys, os
#%%
# specify input and output variables #%% specify input and output variables
homedir = os.path.expanduser('~') homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'
#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'
#======= #=======
# input # input
#======= #=======
indir = 'git/Data/pyrazinamide/input/original' #indir = 'git/Data/pyrazinamide/input/original'
in_filename = "3pl1.fasta.txt" indir = datadir + '/' + drug + '/' + 'input'
infile = homedir + '/' + indir + '/' + in_filename in_filename = '3pl1.fasta.txt'
print(infile) infile = indir + '/' + in_filename
print('Input filename:', in_filename
, '\nInput path:', indir)
#======= #=======
# output # output
#======= #=======
outdir = 'git/Data/pyrazinamide/output' outdir = datadir + '/' + drug + '/' + 'output'
out_filename = "kd.csv" out_filename = gene.lower() + '_kd.csv'
outfile = homedir + '/' + outdir + '/' + out_filename outfile = outdir + '/' + out_filename
print(outfile) print('Output filename:', out_filename
#%% , '\nOutput path:', outdir)
#%%11
# specify window size for hydropathy profile computation # specify window size for hydropathy profile computation
# https://web.expasy.org/protscale/pscale/protscale_help.html # https://web.expasy.org/protscale/pscale/protscale_help.html
my_window = 3 my_window = 3
@ -43,7 +70,7 @@ offset = round((my_window/2)-0.5)
fh = open(infile) fh = open(infile)
for record in SeqIO.parse(fh, "fasta"): for record in SeqIO.parse(fh, 'fasta'):
id = record.id id = record.id
seq = record.seq seq = record.seq
num_residues = len(seq) num_residues = len(seq)
@ -58,14 +85,26 @@ kd_values = (X.protein_scale(ProtParamData.kd , window = my_window)) # edge weig
# sanity checks # sanity checks
print('Sequence Length:', num_residues) print('Sequence Length:', num_residues)
print('kd_values Length:',len(kd_values)) print('kd_values Length:',len(kd_values))
print('Window Length:',my_window) print('Window Length:', my_window)
print('Window Offset:',offset) print('Window Offset:', offset)
print('======================================================================')
# make a df each for; aa sequence and kd_values. Then reset index for each df which will allow easy merging of the two print('Checking:len(kd values) is as expected for the given window size & offset...')
expected_length = num_residues - (my_window - offset)
if len(kd_values) == expected_length:
print('PASS: expected and actual length of kd values match')
else:
print('FAIL: length mismatch'
,'\nExpected length:', expected_length
,'\nActual length:', len(kd_values))
print('======================================================================')
#%% make 2 dfs; 1) aa sequence and 2) kd_values. Then reset index for each df
# which will allow easy merging of the two dfs.
# df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence) # df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence)
dfSeq = pd.DataFrame({'aa_wt':list(sequence)}) # col name for wt is the same as reflected in the the AF_OR file to allow easy merging
dfSeq.index = np.arange(1, len(dfSeq) + 1) #python is not inclusive dfSeq = pd.DataFrame({'wild_type':list(sequence)})
dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive
# df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values # df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values
dfVals = pd.DataFrame({'kd_values':kd_values}) dfVals = pd.DataFrame({'kd_values':kd_values})
@ -76,19 +115,35 @@ max(dfVals['kd_values'])
min(dfVals['kd_values']) min(dfVals['kd_values'])
# Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2 # Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2
# This will introduce NaN where there is missing values. In our case this will be 2 (first and last ones)
# Conveniently, the last position in this case is not part of the struc, so not much loss of info
# Needless to state that this will be variable for other targets.
df = pd.concat([dfSeq, dfVals], axis = 1) df = pd.concat([dfSeq, dfVals], axis = 1)
# rename index to position # rename index to position
df = df.rename_axis('position') df = df.rename_axis('position')
print(df) print(df)
#%% #%% write file
# write file print('Writing file:', out_filename
, '\nFilename:', out_filename
, '\nPath:', outdir)
df.to_csv(outfile, header = True, index = True) df.to_csv(outfile, header = True, index = True)
print('Finished writing:', out_filename
, '\nNo. of rows:', len(df)
, '\nNo. of cols:', len(df.columns))
#%% Plot #%% Plot
# http://www.dalkescientific.com/writings/NBN/plotting.html # http://www.dalkescientific.com/writings/NBN/plotting.html
# FIXME: save fig
# extract just pdb if from 'id' to pass to title of plot
# foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
plot(kd_values, linewidth = 1.0) plot(kd_values, linewidth = 1.0)
#axis(xmin = 1, xmax = num_residues) #axis(xmin = 1, xmax = num_residues)
xlabel("Residue Number") xlabel('Residue Number')
ylabel("Hydrophobicity") ylabel('Hydrophobicity')
title("K&D Hydrophobicity for " + id) title('K&D Hydrophobicity for ' + id)
show() show()
#%% end of script #%% end of script

View file

@ -7,29 +7,25 @@ Created on Tue Aug 6 12:56:03 2019
''' '''
# FIXME: import dirs.py to get the basic dir paths available # FIXME: import dirs.py to get the basic dir paths available
#=======================================================================
#%% load libraries
###################
# load libraries
import os, sys
import pandas as pd
#import numpy as np
#====================================================
# TASK: calculate how many mutations result in # TASK: calculate how many mutations result in
# electrostatic changes wrt wt # electrostatic changes wrt wt
# Input: mcsm and AF_OR file # Input: mcsm and AF_OR file
# Output: mut_elec_changes_results.txt # Output: mut_elec_changes_results.txt
#======================================================== #=======================================================================
#%% load libraries
import os, sys
import pandas as pd
#import numpy as np
#%% specify homedir as python doesn't recognise tilde #%% specify homedir as python doesn't recognise tilde
homedir = os.path.expanduser('~') homedir = os.path.expanduser('~')
# my working dir # set working dir
os.getcwd() os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd() os.getcwd()
#=======================================================================
#========================================================
#%% variable assignment: input and output paths & filenames #%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide' drug = 'pyrazinamide'
gene = 'pncA' gene = 'pncA'
@ -41,28 +37,29 @@ gene_match = gene + '_p.'
#indir = 'git/Data/pyrazinamide/input/original' #indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data' datadir = homedir + '/' + 'git/Data'
#========== #=======
# input dir # input
#========== #=======
indir = datadir + '/' + drug + '/' + 'input' indir = datadir + '/' + drug + '/' + 'input'
in_filename = 'merged_df3.csv'
infile = outdir + '/' + in_filename
print('Input filename: ', in_filename
, '\nInput path: ', indir)
#============ #=======
# output dir # output
#============ #=======
# several output files
outdir = datadir + '/' + drug + '/' + 'output' outdir = datadir + '/' + drug + '/' + 'output'
# specify output file # specify output file
out_filename = 'mut_elec_changes.txt' out_filename = 'mut_elec_changes.txt'
outfile = outdir + '/' + out_filename outfile = outdir + '/' + out_filename
print('Output path: ', outdir) print('Output filename: ', out_filename
, '\nOutput path: ', outdir)
#%% end of variable assignment for input and output files #%% end of variable assignment for input and output files
#============================================================= #=======================================================================
#%% Read input files #%% Read input files
#in_filename = gene.lower() + '_meta_data_with_AFandOR.csv' #in_filename = gene.lower() + '_meta_data_with_AFandOR.csv'
in_filename = 'merged_df3.csv'
infile = outdir + '/' + in_filename
print('Reading input file (merged file):', infile) print('Reading input file (merged file):', infile)
comb_df = pd.read_csv(infile, sep = ',') comb_df = pd.read_csv(infile, sep = ',')