From bead9a48bd6235d5b4a0b3d9c620af61617f2d3e Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 7 Apr 2020 17:42:59 +0100 Subject: [PATCH] adapted rd_df script to make it take command line args and define function --- scripts/rd_df.py | 175 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100755 scripts/rd_df.py diff --git a/scripts/rd_df.py b/scripts/rd_df.py new file mode 100755 index 0000000..b4d0f43 --- /dev/null +++ b/scripts/rd_df.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +Created on Tue Aug 6 12:56:03 2019 + +@author: tanu +''' +#============================================================================= +# Task: Residue depth (rd) processing to generate a df with residue_depth(rd) +# values + +# FIXME +# Input: '.tsv' i.e residue depth txt file (output from .zip file manually +# downloaded from the website). +# This should be integrated into the pipeline + +# Output: .csv with 3 cols i.e position, rd_values & 3-letter wt aa code(caps) +#============================================================================= +#%% load packages +import sys, os +import argparse +import pandas as pd +#============================================================================= +#%% specify input and curr dir +homedir = os.path.expanduser('~') + +# set working dir +os.getcwd() +os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +os.getcwd() +#======================================================================= +#%% command line args +arg_parser = argparse.ArgumentParser() +#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') +#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive +arg_parser.add_argument('-d', '--drug', help='drug name', default = 'DRUGNAME') +arg_parser.add_argument('-g', '--gene', help='gene name', default = 'geneName') +args = arg_parser.parse_args() +#======================================================================= +#%% variable assignment: input and output +#drug = 'pyrazinamide' +#gene = 'pncA' +drug = args.drug +gene = args.gene +gene_match = gene + '_p.' + +#========== +# data dir +#========== +datadir = homedir + '/' + 'git/Data' + +#======= +# input +#======= +outdir = datadir + '/' + drug + '/' + 'output' +in_filename = '3pl1_rd.tsv' +infile = outdir + '/' + in_filename +print('Input filename:', in_filename + , '\nInput path:', outdir + , '\n=============================================================') +#======= +# output +#======= +outdir = datadir + '/' + drug + '/' + 'output' +out_filename = gene.lower() + '_rd.csv' +outfile = outdir + '/' + out_filename +print('Output filename:', out_filename + , '\nOutput path:', outdir + , '\n=============================================================') + +#%% end of variable assignment for input and output files +#======================================================================= +#%% rd values from _rd.tsv values +def rd_to_csv(inputtsv, outputrdcsv): + """ + Calculate kd (hydropathy values) from input fasta file + + @param inputtsv: tsv file downloaded from {INSERT LINK} + @type inputtsv: string + + @param outputrdsv: csv file with rd values + @type outfile: string + + @return: writes df of kd values as csv + @type: .csv + """ + #======================== + # read downloaded tsv file + #======================== + #%% Read input file + rd_data = pd.read_csv(inputtsv, sep = '\t') + print('Reading input file:', inputtsv + , '\nNo. of rows:', len(rd_data) + , '\nNo. of cols:', len(rd_data.columns)) + + print('Column names:', rd_data.columns + , '\n===============================================================') + #======================== + # creating position col + #======================== + # Extracting residue number from index and assigning + # the values to a column [position]. Then convert the position col to numeric. + rd_data['position'] = rd_data.index.str.extract('([0-9]+)').values + + # converting position to numeric + rd_data['position'] = pd.to_numeric(rd_data['position']) + rd_data['position'].dtype + + print('Extracted residue num from index and assigned as a column:' + , '\ncolumn name: position' + , '\ntotal no. of cols now:', len(rd_data.columns) + , '\n=============================================================') + + #======================== + # Renaming amino-acid + # and all-atom cols + #======================== + print('Renaming columns:' + , '\ncolname==> # chain:residue: wt_3letter_caps' + , '\nYES... the column name *actually* contains a # ..!' + , '\ncolname==> all-atom: rd_values' + , '\n=============================================================') + + rd_data.rename(columns = {'# chain:residue':'wt_3letter_caps', 'all-atom':'rd_values'}, inplace = True) + print('Column names:', rd_data.columns) + + #======================== + # extracting df with the + # desired columns + #======================== + print('Extracting relevant columns for writing df as csv') + + rd_df = rd_data[['position','rd_values','wt_3letter_caps']] + + if len(rd_df) == len(rd_data): + print('PASS: extracted df has expected no. of rows' + ,'\nExtracted df dim:' + ,'\nNo. of rows:', len(rd_df) + ,'\nNo. of cols:', len(rd_df.columns)) + else: + print('FAIL: no. of rows mimatch' + , '\nExpected no. of rows:', len(rd_data) + , '\nGot no. of rows:', len(rd_df) + , '\n=====================================================') + #=============== + # writing file + #=============== + print('Writing file:' + , '\nFilename:', outputrdcsv +# , '\nPath:', outdir +# , '\nExpected no. of rows:', len(rd_df) +# , '\nExpected no. of cols:', len(rd_df.columns) + , '\n=========================================================') + + rd_df.to_csv(outputrdcsv, header = True, index = False) + +#%% end of function +#======================================================================= +#%% call function +#rd_to_csv(infile, outfile) +#======================================================================= +def main(): + print('Running hydropathy calcs', in_filename, 'output csv:', out_filename) + rd_to_csv(infile, outfile) + print('Finished Writing file:' + , '\nFilename:', outfile + , '\nPath:', outdir +## , '\nNo. of rows:', '' +## , '\nNo. of cols:', '' + , '\n=============================================================') + +if __name__ == '__main__': + main() +#%% end of script +#=======================================================================