updated kd.py to relfect a merging col for combining num params later
This commit is contained in:
parent
d44ab57f5a
commit
37e1d43b76
3 changed files with 137 additions and 82 deletions
|
@ -10,16 +10,7 @@ Created on Tue Aug 6 12:56:03 2019
|
||||||
# concentrate on positions that have structural info?
|
# concentrate on positions that have structural info?
|
||||||
|
|
||||||
# FIXME: import dirs.py to get the basic dir paths available
|
# FIXME: import dirs.py to get the basic dir paths available
|
||||||
|
#=======================================================================
|
||||||
#%% load libraries
|
|
||||||
import os, sys
|
|
||||||
import pandas as pd
|
|
||||||
#import numpy as np
|
|
||||||
|
|
||||||
#from pandas.api.types import is_string_dtype
|
|
||||||
#from pandas.api.types import is_numeric_dtype
|
|
||||||
|
|
||||||
#========================================================
|
|
||||||
# TASK: extract ALL pncA_p. mutations from GWAS data
|
# TASK: extract ALL pncA_p. mutations from GWAS data
|
||||||
# Input data file has the following format: each row = unique sample id
|
# Input data file has the following format: each row = unique sample id
|
||||||
# id,country,lineage,sublineage,drtype,pyrazinamide,dr_mutations_pyrazinamide,other_mutations_pyrazinamide...
|
# id,country,lineage,sublineage,drtype,pyrazinamide,dr_mutations_pyrazinamide,other_mutations_pyrazinamide...
|
||||||
|
@ -38,46 +29,58 @@ import pandas as pd
|
||||||
# 3) pnca_metadata.csv
|
# 3) pnca_metadata.csv
|
||||||
# 4) pnca_all_muts_msa.csv
|
# 4) pnca_all_muts_msa.csv
|
||||||
# 5) pnca_mutational_positons.csv
|
# 5) pnca_mutational_positons.csv
|
||||||
#========================================================
|
#=======================================================================
|
||||||
|
#%% load libraries
|
||||||
|
import os, sys
|
||||||
|
import pandas as pd
|
||||||
|
#import numpy as np
|
||||||
|
|
||||||
|
#from pandas.api.types import is_string_dtype
|
||||||
|
#from pandas.api.types import is_numeric_dtype
|
||||||
|
|
||||||
#%% specify homedir as python doesn't recognise tilde
|
#%% specify homedir as python doesn't recognise tilde
|
||||||
homedir = os.path.expanduser('~')
|
homedir = os.path.expanduser('~')
|
||||||
|
|
||||||
# my working dir
|
# set working dir
|
||||||
os.getcwd()
|
os.getcwd()
|
||||||
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
||||||
os.getcwd()
|
os.getcwd()
|
||||||
|
|
||||||
# import aa dict
|
# import aa dict
|
||||||
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
|
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
|
||||||
#========================================================
|
#=======================================================================
|
||||||
|
|
||||||
#%% variable assignment: input and output paths & filenames
|
#%% variable assignment: input and output paths & filenames
|
||||||
drug = 'pyrazinamide'
|
drug = 'pyrazinamide'
|
||||||
gene = 'pncA'
|
gene = 'pncA'
|
||||||
gene_match = gene + '_p.'
|
gene_match = gene + '_p.'
|
||||||
|
|
||||||
#==========
|
#=======
|
||||||
# input dir
|
# data dir
|
||||||
#==========
|
#=======
|
||||||
#indir = 'git/Data/pyrazinamide/input/original'
|
#indir = 'git/Data/pyrazinamide/input/original'
|
||||||
indir = homedir + '/' + 'git/Data'
|
datadir = homedir + '/' + 'git/Data'
|
||||||
|
|
||||||
#===========
|
#=======
|
||||||
# output dir
|
# input
|
||||||
#===========
|
#=======
|
||||||
|
#indir = 'git/Data/pyrazinamide/input/original'
|
||||||
|
in_filename = 'original_tanushree_data_v2.csv'
|
||||||
|
infile = datadir + '/' + in_filename
|
||||||
|
print('Input filename: ', in_filename
|
||||||
|
, '\nInput path: ', indir)
|
||||||
|
|
||||||
|
#=======
|
||||||
|
# output
|
||||||
|
#=======
|
||||||
# several output files
|
# several output files
|
||||||
# output filenames in respective sections at the time of outputting files
|
# output filenames in respective sections at the time of outputting files
|
||||||
#outdir = 'git/Data/pyrazinamide/output'
|
outdir = datadir + '/' + drug + '/' + 'output'
|
||||||
outdir = homedir + '/' + 'git/Data' + '/' + drug + '/' + 'output'
|
print('Output filename: in the respective sections'
|
||||||
|
, '\nOutput path: ', outdir)
|
||||||
|
|
||||||
#%%end of variable assignment for input and output files
|
#%%end of variable assignment for input and output files
|
||||||
#==============================================================================
|
#=======================================================================
|
||||||
#%% Read files
|
#%% Read input file
|
||||||
|
|
||||||
in_filename = 'original_tanushree_data_v2.csv'
|
|
||||||
infile = indir + '/' + in_filename
|
|
||||||
print('Reading input master file:', infile)
|
|
||||||
|
|
||||||
master_data = pd.read_csv(infile, sep = ',')
|
master_data = pd.read_csv(infile, sep = ',')
|
||||||
|
|
||||||
# column names
|
# column names
|
||||||
|
|
|
@ -1,13 +1,19 @@
|
||||||
#!/usr/bin/python
|
#!/usr/bin/env python3
|
||||||
#%%
|
# -*- coding: utf-8 -*-
|
||||||
|
'''
|
||||||
|
Created on Tue Aug 6 12:56:03 2019
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
'''
|
||||||
|
#=======================================================================
|
||||||
# Task: Hydrophobicity (Kd) values for amino acid sequence using the Kyt&-Doolittle
|
# Task: Hydrophobicity (Kd) values for amino acid sequence using the Kyt&-Doolittle
|
||||||
# Same output as using the expasy server https://web.expasy.org/protscale/
|
# Same output as using the expasy server https://web.expasy.org/protscale/
|
||||||
|
|
||||||
# useful links
|
# useful links
|
||||||
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
|
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
|
||||||
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
|
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
|
||||||
#%%
|
#=======================================================================
|
||||||
# load packages
|
#%% load packages
|
||||||
from pylab import *
|
from pylab import *
|
||||||
from Bio.SeqUtils import ProtParamData
|
from Bio.SeqUtils import ProtParamData
|
||||||
from Bio.SeqUtils.ProtParam import ProteinAnalysis
|
from Bio.SeqUtils.ProtParam import ProteinAnalysis
|
||||||
|
@ -17,25 +23,46 @@ from Bio import SeqIO
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import sys, os
|
import sys, os
|
||||||
#%%
|
|
||||||
# specify input and output variables
|
#%% specify input and output variables
|
||||||
homedir = os.path.expanduser('~')
|
homedir = os.path.expanduser('~')
|
||||||
|
|
||||||
|
# set working dir
|
||||||
|
os.getcwd()
|
||||||
|
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
||||||
|
os.getcwd()
|
||||||
|
#=======================================================================
|
||||||
|
#%% variable assignment: input and output paths & filenames
|
||||||
|
drug = 'pyrazinamide'
|
||||||
|
gene = 'pncA'
|
||||||
|
gene_match = gene + '_p.'
|
||||||
|
|
||||||
|
#==========
|
||||||
|
# data dir
|
||||||
|
#==========
|
||||||
|
#indir = 'git/Data/pyrazinamide/input/original'
|
||||||
|
datadir = homedir + '/' + 'git/Data'
|
||||||
|
|
||||||
#=======
|
#=======
|
||||||
# input
|
# input
|
||||||
#=======
|
#=======
|
||||||
indir = 'git/Data/pyrazinamide/input/original'
|
#indir = 'git/Data/pyrazinamide/input/original'
|
||||||
in_filename = "3pl1.fasta.txt"
|
indir = datadir + '/' + drug + '/' + 'input'
|
||||||
infile = homedir + '/' + indir + '/' + in_filename
|
in_filename = '3pl1.fasta.txt'
|
||||||
print(infile)
|
infile = indir + '/' + in_filename
|
||||||
|
print('Input filename:', in_filename
|
||||||
|
, '\nInput path:', indir)
|
||||||
|
|
||||||
#=======
|
#=======
|
||||||
# output
|
# output
|
||||||
#=======
|
#=======
|
||||||
outdir = 'git/Data/pyrazinamide/output'
|
outdir = datadir + '/' + drug + '/' + 'output'
|
||||||
out_filename = "kd.csv"
|
out_filename = gene.lower() + '_kd.csv'
|
||||||
outfile = homedir + '/' + outdir + '/' + out_filename
|
outfile = outdir + '/' + out_filename
|
||||||
print(outfile)
|
print('Output filename:', out_filename
|
||||||
#%%
|
, '\nOutput path:', outdir)
|
||||||
|
|
||||||
|
#%%11
|
||||||
# specify window size for hydropathy profile computation
|
# specify window size for hydropathy profile computation
|
||||||
# https://web.expasy.org/protscale/pscale/protscale_help.html
|
# https://web.expasy.org/protscale/pscale/protscale_help.html
|
||||||
my_window = 3
|
my_window = 3
|
||||||
|
@ -43,7 +70,7 @@ offset = round((my_window/2)-0.5)
|
||||||
|
|
||||||
fh = open(infile)
|
fh = open(infile)
|
||||||
|
|
||||||
for record in SeqIO.parse(fh, "fasta"):
|
for record in SeqIO.parse(fh, 'fasta'):
|
||||||
id = record.id
|
id = record.id
|
||||||
seq = record.seq
|
seq = record.seq
|
||||||
num_residues = len(seq)
|
num_residues = len(seq)
|
||||||
|
@ -60,11 +87,23 @@ print('Sequence Length:', num_residues)
|
||||||
print('kd_values Length:',len(kd_values))
|
print('kd_values Length:',len(kd_values))
|
||||||
print('Window Length:', my_window)
|
print('Window Length:', my_window)
|
||||||
print('Window Offset:', offset)
|
print('Window Offset:', offset)
|
||||||
|
print('======================================================================')
|
||||||
|
print('Checking:len(kd values) is as expected for the given window size & offset...')
|
||||||
|
expected_length = num_residues - (my_window - offset)
|
||||||
|
if len(kd_values) == expected_length:
|
||||||
|
print('PASS: expected and actual length of kd values match')
|
||||||
|
else:
|
||||||
|
print('FAIL: length mismatch'
|
||||||
|
,'\nExpected length:', expected_length
|
||||||
|
,'\nActual length:', len(kd_values))
|
||||||
|
|
||||||
# make a df each for; aa sequence and kd_values. Then reset index for each df which will allow easy merging of the two
|
print('======================================================================')
|
||||||
|
#%% make 2 dfs; 1) aa sequence and 2) kd_values. Then reset index for each df
|
||||||
|
# which will allow easy merging of the two dfs.
|
||||||
|
|
||||||
# df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence)
|
# df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence)
|
||||||
dfSeq = pd.DataFrame({'aa_wt':list(sequence)})
|
# col name for wt is the same as reflected in the the AF_OR file to allow easy merging
|
||||||
|
dfSeq = pd.DataFrame({'wild_type':list(sequence)})
|
||||||
dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive
|
dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive
|
||||||
|
|
||||||
# df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values
|
# df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values
|
||||||
|
@ -76,19 +115,35 @@ max(dfVals['kd_values'])
|
||||||
min(dfVals['kd_values'])
|
min(dfVals['kd_values'])
|
||||||
|
|
||||||
# Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2
|
# Merge the two on index (as these are now reflective of the aa position numbers): df1 and df2
|
||||||
|
# This will introduce NaN where there is missing values. In our case this will be 2 (first and last ones)
|
||||||
|
# Conveniently, the last position in this case is not part of the struc, so not much loss of info
|
||||||
|
# Needless to state that this will be variable for other targets.
|
||||||
|
|
||||||
df = pd.concat([dfSeq, dfVals], axis = 1)
|
df = pd.concat([dfSeq, dfVals], axis = 1)
|
||||||
# rename index to position
|
# rename index to position
|
||||||
df = df.rename_axis('position')
|
df = df.rename_axis('position')
|
||||||
print(df)
|
print(df)
|
||||||
#%%
|
#%% write file
|
||||||
# write file
|
print('Writing file:', out_filename
|
||||||
|
, '\nFilename:', out_filename
|
||||||
|
, '\nPath:', outdir)
|
||||||
|
|
||||||
df.to_csv(outfile, header = True, index = True)
|
df.to_csv(outfile, header = True, index = True)
|
||||||
|
|
||||||
|
print('Finished writing:', out_filename
|
||||||
|
, '\nNo. of rows:', len(df)
|
||||||
|
, '\nNo. of cols:', len(df.columns))
|
||||||
|
|
||||||
#%% Plot
|
#%% Plot
|
||||||
# http://www.dalkescientific.com/writings/NBN/plotting.html
|
# http://www.dalkescientific.com/writings/NBN/plotting.html
|
||||||
|
|
||||||
|
# FIXME: save fig
|
||||||
|
# extract just pdb if from 'id' to pass to title of plot
|
||||||
|
# foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
|
||||||
plot(kd_values, linewidth = 1.0)
|
plot(kd_values, linewidth = 1.0)
|
||||||
#axis(xmin = 1, xmax = num_residues)
|
#axis(xmin = 1, xmax = num_residues)
|
||||||
xlabel("Residue Number")
|
xlabel('Residue Number')
|
||||||
ylabel("Hydrophobicity")
|
ylabel('Hydrophobicity')
|
||||||
title("K&D Hydrophobicity for " + id)
|
title('K&D Hydrophobicity for ' + id)
|
||||||
show()
|
show()
|
||||||
#%% end of script
|
#%% end of script
|
||||||
|
|
|
@ -7,29 +7,25 @@ Created on Tue Aug 6 12:56:03 2019
|
||||||
'''
|
'''
|
||||||
|
|
||||||
# FIXME: import dirs.py to get the basic dir paths available
|
# FIXME: import dirs.py to get the basic dir paths available
|
||||||
|
#=======================================================================
|
||||||
#%% load libraries
|
|
||||||
###################
|
|
||||||
# load libraries
|
|
||||||
import os, sys
|
|
||||||
import pandas as pd
|
|
||||||
#import numpy as np
|
|
||||||
|
|
||||||
#====================================================
|
|
||||||
# TASK: calculate how many mutations result in
|
# TASK: calculate how many mutations result in
|
||||||
# electrostatic changes wrt wt
|
# electrostatic changes wrt wt
|
||||||
# Input: mcsm and AF_OR file
|
# Input: mcsm and AF_OR file
|
||||||
# Output: mut_elec_changes_results.txt
|
# Output: mut_elec_changes_results.txt
|
||||||
#========================================================
|
#=======================================================================
|
||||||
|
#%% load libraries
|
||||||
|
import os, sys
|
||||||
|
import pandas as pd
|
||||||
|
#import numpy as np
|
||||||
|
|
||||||
#%% specify homedir as python doesn't recognise tilde
|
#%% specify homedir as python doesn't recognise tilde
|
||||||
homedir = os.path.expanduser('~')
|
homedir = os.path.expanduser('~')
|
||||||
|
|
||||||
# my working dir
|
# set working dir
|
||||||
os.getcwd()
|
os.getcwd()
|
||||||
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
||||||
os.getcwd()
|
os.getcwd()
|
||||||
|
#=======================================================================
|
||||||
#========================================================
|
|
||||||
#%% variable assignment: input and output paths & filenames
|
#%% variable assignment: input and output paths & filenames
|
||||||
drug = 'pyrazinamide'
|
drug = 'pyrazinamide'
|
||||||
gene = 'pncA'
|
gene = 'pncA'
|
||||||
|
@ -41,28 +37,29 @@ gene_match = gene + '_p.'
|
||||||
#indir = 'git/Data/pyrazinamide/input/original'
|
#indir = 'git/Data/pyrazinamide/input/original'
|
||||||
datadir = homedir + '/' + 'git/Data'
|
datadir = homedir + '/' + 'git/Data'
|
||||||
|
|
||||||
#==========
|
#=======
|
||||||
# input dir
|
# input
|
||||||
#==========
|
#=======
|
||||||
indir = datadir + '/' + drug + '/' + 'input'
|
indir = datadir + '/' + drug + '/' + 'input'
|
||||||
|
in_filename = 'merged_df3.csv'
|
||||||
|
infile = outdir + '/' + in_filename
|
||||||
|
print('Input filename: ', in_filename
|
||||||
|
, '\nInput path: ', indir)
|
||||||
|
|
||||||
#============
|
#=======
|
||||||
# output dir
|
# output
|
||||||
#============
|
#=======
|
||||||
# several output files
|
|
||||||
outdir = datadir + '/' + drug + '/' + 'output'
|
outdir = datadir + '/' + drug + '/' + 'output'
|
||||||
|
|
||||||
# specify output file
|
# specify output file
|
||||||
out_filename = 'mut_elec_changes.txt'
|
out_filename = 'mut_elec_changes.txt'
|
||||||
outfile = outdir + '/' + out_filename
|
outfile = outdir + '/' + out_filename
|
||||||
print('Output path: ', outdir)
|
print('Output filename: ', out_filename
|
||||||
|
, '\nOutput path: ', outdir)
|
||||||
|
|
||||||
#%% end of variable assignment for input and output files
|
#%% end of variable assignment for input and output files
|
||||||
#=============================================================
|
#=======================================================================
|
||||||
#%% Read input files
|
#%% Read input files
|
||||||
#in_filename = gene.lower() + '_meta_data_with_AFandOR.csv'
|
#in_filename = gene.lower() + '_meta_data_with_AFandOR.csv'
|
||||||
in_filename = 'merged_df3.csv'
|
|
||||||
infile = outdir + '/' + in_filename
|
|
||||||
print('Reading input file (merged file):', infile)
|
print('Reading input file (merged file):', infile)
|
||||||
|
|
||||||
comb_df = pd.read_csv(infile, sep = ',')
|
comb_df = pd.read_csv(infile, sep = ',')
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue