results for electrostatic changes

This commit is contained in:
Tanushree Tunstall 2020-09-11 10:27:56 +01:00
parent 078644c322
commit 1262df40c9

View file

@ -11,7 +11,7 @@ Created on Tue Aug 6 12:56:03 2019
# TASK: calculate how many mutations result in # TASK: calculate how many mutations result in
# electrostatic changes wrt wt # electrostatic changes wrt wt
# Input: mcsm and AF_OR file # Input: merged_df3
# Output: mut_elec_changes_results.txt # Output: mut_elec_changes_results.txt
#======================================================================= #=======================================================================
@ -19,70 +19,91 @@ Created on Tue Aug 6 12:56:03 2019
import os, sys import os, sys
import pandas as pd import pandas as pd
#import numpy as np #import numpy as np
#from varname import nameof
import argparse
#======================================================================= #=======================================================================
#%% specify homedir and curr dir #%% specify input and curr dir
homedir = os.path.expanduser('~') homedir = os.path.expanduser('~')
# set working dir # set working dir
os.getcwd() os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd() os.getcwd()
#======================================================================= #=======================================================================
#%% variable assignment: input and output #%% command line args
drug = 'pyrazinamide' arg_parser = argparse.ArgumentParser()
gene = 'pncA' arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
gene_match = gene + '_p.' arg_parser.add_argument('-g', '--gene', help='gene name', default = None)
#========== arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
# data dir arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
#========== arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data' arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
args = arg_parser.parse_args()
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
#%%=======================================================================
#==============
# directories
#==============
if not datadir:
datadir = homedir + '/' + 'git/Data'
if not indir:
indir = datadir + '/' + drug + '/input'
if not outdir:
outdir = datadir + '/' + drug + '/output'
#======= #=======
# input # input
#======= #=======
indir = datadir + '/' + drug + '/' + 'input'
in_filename = 'merged_df3.csv' in_filename = 'merged_df3.csv'
infile = outdir + '/' + in_filename infile_merged_df3 = outdir + '/' + in_filename
print('Input filename: ', in_filename print('Input file: ', infile_merged_df3
, '\nInput path: ', indir
, '\n============================================================') , '\n============================================================')
#======= #=======
# output # output
#======= #=======
outdir = datadir + '/' + drug + '/' + 'output'
# specify output file
out_filename = 'mut_elec_changes.txt' out_filename = 'mut_elec_changes.txt'
outfile = outdir + '/' + out_filename outfile_elec_changes = outdir + '/' + out_filename
print('Output filename: ', out_filename print('Output file: ', outfile_elec_changes
, '\nOutput path: ', outdir
, '\n============================================================') , '\n============================================================')
#%% end of variable assignment for input and output files #%% end of variable assignment for input and output files
#======================================================================= #=======================================================================
#%% Read input files #%% Read input files
print('Reading input file (merged file):', infile) print('Reading input file (merged file):', infile_merged_df3)
comb_df = pd.read_csv(infile, sep = ',') comb_df = pd.read_csv(infile_merged_df3, sep = ',')
print('Input filename: ', in_filename print('Input filename: ', in_filename
, '\nPath :', outdir , '\nPath :', outdir
, '\nNo. of rows: ', len(comb_df) , '\nNo. of rows: ', len(comb_df)
, '\nNo. of cols: ', infile , '\nNo. of cols: ', len(comb_df.columns)
, '\n============================================================') , '\n============================================================')
# column names # column names
list(comb_df.columns) list(comb_df.columns)
# clear variables # clear variables
del(in_filename, infile) del(in_filename, infile_merged_df3)
#%% subset unique mutations #%% subset unique mutations
df = comb_df.drop_duplicates(['Mutationinformation'], keep = 'first') df = comb_df.drop_duplicates(['mutationinformation'], keep = 'first')
total_muts = df.Mutationinformation.nunique() total_muts = df['mutationinformation'].nunique()
#df.Mutationinformation.count() #df.Mutationinformation.count()
print('Total mutations associated with structure: ', total_muts print('Total mutations associated with structure: ', total_muts
, '\n===============================================================') , '\n===============================================================')
@ -91,11 +112,12 @@ print('Total mutations associated with structure: ', total_muts
# check if all muts have been categorised # check if all muts have been categorised
print('Checking if all muts have been categorised: ') print('Checking if all muts have been categorised: ')
if df['wt_calcprop'].isna().sum() == 0 & df['mut_calcprop'].isna().sum(): if df['wt_calcprop'].isna().sum() == 0 & df['mut_calcprop'].isna().sum():
print('PASS: No. NA detected i.e all muts have aa prop associated' print('PASS: No. NA detected. All muts have aa prop associated'
, '\n===============================================================') , '\n===============================================================')
else: else:
print('FAIL: NAs detected i.e some muts remain unclassified' print('FAIL: NAs detected. Some muts remain unclassified'
, '\n===============================================================') , '\n===============================================================')
sys.exit()
df['wt_calcprop'].head() df['wt_calcprop'].head()
df['mut_calcprop'].head() df['mut_calcprop'].head()
@ -109,7 +131,7 @@ mut_categ = df["aa_calcprop_combined"].unique()
print('Total no. of aa_calc properties: ', len(mut_categ)) print('Total no. of aa_calc properties: ', len(mut_categ))
print('Categories are: ', mut_categ) print('Categories are: ', mut_categ)
# counting no. of muts in each mut categ # Counting no. of muts in each mut categ
# way1: count values within each combinaton # way1: count values within each combinaton
df.groupby('aa_calcprop_combined').size() df.groupby('aa_calcprop_combined').size()
@ -137,31 +159,47 @@ print('Total no.of muts with elec changes: ', elec_count)
# calculate percentage of electrostatic changes # calculate percentage of electrostatic changes
elec_changes = (elec_count/total_muts) * 100 elec_changes = (elec_count/total_muts) * 100
print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes) print('\n==============================================================='
, '\nResults for electrostatic changes from nsSNPs for', gene,':'
, '\n\nTotal no. of mutations:', total_muts
, '\nMutations with electrostatic changes:', elec_count
, '\nPercentage(%) of electrostatic changes:', elec_changes
, '\n===============================================================')
# check no change muts # check no change muts
no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == True] no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == True]
no_change_muts.mut_count.sum() no_change_muts.mut_count.sum()
if no_change_muts.mut_count.sum() + elec_count == total_muts:
print('\nPASS: numbers cross checked'
, '\nWriting output file:', outfile_elec_changes)
else:
print('\nFAIL: numbers mismatch')
sys.exit()
#%% output from console #%% output from console
#sys.stdout = open(file, 'w') #sys.stdout = open(file, 'w')
sys.stdout = open(outfile, 'w') sys.stdout = open(outfile_elec_changes, 'w')
print('======================\n' print('################################################'
,'Unchanged muts' , '\nResults for gene:', gene
,'\n=====================\n' , '\n################################################'
, no_change_muts , '\n\n======================'
,'\n=============================\n' , '\nUnchanged muts:'
, 'Muts with changed prop:' , '\n=====================\n'
, no_change_muts
, '\n=============================\n'
, 'Muts with changed properties:'
, '\n============================\n' , '\n============================\n'
, all_prop_change) , all_prop_change
, '\n=====================================================')
print('=================================================================' print('Total no. of mutations: ', total_muts
, '\nTotal number of electrostatic changes resulting from Mtation is (%):', elec_changes , '\nTotal no. of mutions with electrostatic changes:', elec_count
, '\nTotal no. of muts: ', total_muts , '\nCorresponding (%):', elec_changes
, '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum() , '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum()
, '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum() , '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum()
, '\n===================================================================') , '\n=======================================================')
#%% end of script #%% end of script
#======================================================================= #=======================================================================