From 1262df40c971832cda279110d2dce0d7de47e18e Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 11 Sep 2020 10:27:56 +0100 Subject: [PATCH] results for electrostatic changes --- scripts/mut_electrostatic_changes.py | 130 +++++++++++++++++---------- 1 file changed, 84 insertions(+), 46 deletions(-) diff --git a/scripts/mut_electrostatic_changes.py b/scripts/mut_electrostatic_changes.py index 20ea86a..3f44df2 100755 --- a/scripts/mut_electrostatic_changes.py +++ b/scripts/mut_electrostatic_changes.py @@ -11,7 +11,7 @@ Created on Tue Aug 6 12:56:03 2019 # TASK: calculate how many mutations result in # electrostatic changes wrt wt -# Input: mcsm and AF_OR file +# Input: merged_df3 # Output: mut_elec_changes_results.txt #======================================================================= @@ -19,70 +19,91 @@ Created on Tue Aug 6 12:56:03 2019 import os, sys import pandas as pd #import numpy as np +#from varname import nameof +import argparse #======================================================================= -#%% specify homedir and curr dir -homedir = os.path.expanduser('~') +#%% specify input and curr dir +homedir = os.path.expanduser('~') # set working dir os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.getcwd() #======================================================================= -#%% variable assignment: input and output -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = gene + '_p.' +#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help='drug name', default = None) +arg_parser.add_argument('-g', '--gene', help='gene name', default = None) -#========== -# data dir -#========== -#indir = 'git/Data/pyrazinamide/input/original' -datadir = homedir + '/' + 'git/Data' +arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') +arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') +arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') + +arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') + +args = arg_parser.parse_args() +#%% variable assignment: input and output +#drug = 'pyrazinamide' +#gene = 'pncA' + +drug = args.drug +gene = args.gene +datadir = args.datadir +indir = args.input_dir +outdir = args.output_dir + +#%%======================================================================= +#============== +# directories +#============== +if not datadir: + datadir = homedir + '/' + 'git/Data' + +if not indir: + indir = datadir + '/' + drug + '/input' + +if not outdir: + outdir = datadir + '/' + drug + '/output' #======= # input #======= -indir = datadir + '/' + drug + '/' + 'input' in_filename = 'merged_df3.csv' -infile = outdir + '/' + in_filename -print('Input filename: ', in_filename - , '\nInput path: ', indir +infile_merged_df3 = outdir + '/' + in_filename +print('Input file: ', infile_merged_df3 , '\n============================================================') #======= # output #======= -outdir = datadir + '/' + drug + '/' + 'output' -# specify output file out_filename = 'mut_elec_changes.txt' -outfile = outdir + '/' + out_filename -print('Output filename: ', out_filename - , '\nOutput path: ', outdir +outfile_elec_changes = outdir + '/' + out_filename +print('Output file: ', outfile_elec_changes , '\n============================================================') #%% end of variable assignment for input and output files #======================================================================= #%% Read input files -print('Reading input file (merged file):', infile) +print('Reading input file (merged file):', infile_merged_df3) -comb_df = pd.read_csv(infile, sep = ',') +comb_df = pd.read_csv(infile_merged_df3, sep = ',') print('Input filename: ', in_filename , '\nPath :', outdir , '\nNo. of rows: ', len(comb_df) - , '\nNo. of cols: ', infile + , '\nNo. of cols: ', len(comb_df.columns) , '\n============================================================') # column names list(comb_df.columns) # clear variables -del(in_filename, infile) +del(in_filename, infile_merged_df3) #%% subset unique mutations -df = comb_df.drop_duplicates(['Mutationinformation'], keep = 'first') +df = comb_df.drop_duplicates(['mutationinformation'], keep = 'first') -total_muts = df.Mutationinformation.nunique() +total_muts = df['mutationinformation'].nunique() #df.Mutationinformation.count() print('Total mutations associated with structure: ', total_muts , '\n===============================================================') @@ -91,11 +112,12 @@ print('Total mutations associated with structure: ', total_muts # check if all muts have been categorised print('Checking if all muts have been categorised: ') if df['wt_calcprop'].isna().sum() == 0 & df['mut_calcprop'].isna().sum(): - print('PASS: No. NA detected i.e all muts have aa prop associated' + print('PASS: No. NA detected. All muts have aa prop associated' , '\n===============================================================') else: - print('FAIL: NAs detected i.e some muts remain unclassified' + print('FAIL: NAs detected. Some muts remain unclassified' , '\n===============================================================') + sys.exit() df['wt_calcprop'].head() df['mut_calcprop'].head() @@ -109,7 +131,7 @@ mut_categ = df["aa_calcprop_combined"].unique() print('Total no. of aa_calc properties: ', len(mut_categ)) print('Categories are: ', mut_categ) -# counting no. of muts in each mut categ +# Counting no. of muts in each mut categ # way1: count values within each combinaton df.groupby('aa_calcprop_combined').size() @@ -137,31 +159,47 @@ print('Total no.of muts with elec changes: ', elec_count) # calculate percentage of electrostatic changes elec_changes = (elec_count/total_muts) * 100 -print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes) +print('\n===============================================================' + , '\nResults for electrostatic changes from nsSNPs for', gene,':' + , '\n\nTotal no. of mutations:', total_muts + , '\nMutations with electrostatic changes:', elec_count + , '\nPercentage(%) of electrostatic changes:', elec_changes + , '\n===============================================================') # check no change muts no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == True] no_change_muts.mut_count.sum() + +if no_change_muts.mut_count.sum() + elec_count == total_muts: + print('\nPASS: numbers cross checked' + , '\nWriting output file:', outfile_elec_changes) +else: + print('\nFAIL: numbers mismatch') + sys.exit() #%% output from console #sys.stdout = open(file, 'w') -sys.stdout = open(outfile, 'w') +sys.stdout = open(outfile_elec_changes, 'w') -print('======================\n' - ,'Unchanged muts' - ,'\n=====================\n' - , no_change_muts - ,'\n=============================\n' - , 'Muts with changed prop:' +print('################################################' + , '\nResults for gene:', gene + , '\n################################################' + , '\n\n======================' + , '\nUnchanged muts:' + , '\n=====================\n' + , no_change_muts + , '\n=============================\n' + , 'Muts with changed properties:' , '\n============================\n' - , all_prop_change) + , all_prop_change + , '\n=====================================================') -print('=================================================================' -, '\nTotal number of electrostatic changes resulting from Mtation is (%):', elec_changes -, '\nTotal no. of muts: ', total_muts -, '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum() -, '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum() -, '\n===================================================================') +print('Total no. of mutations: ', total_muts + , '\nTotal no. of mutions with electrostatic changes:', elec_count + , '\nCorresponding (%):', elec_changes + , '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum() + , '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum() + , '\n=======================================================') #%% end of script #=======================================================================