From c124f49041fe31a6d39f230cf1dd226ca15159cf Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 10 Sep 2020 20:18:35 +0100 Subject: [PATCH] add scripts/mut_electrostatic_changes.py --- scripts/mut_electrostatic_changes.py | 167 +++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100755 scripts/mut_electrostatic_changes.py diff --git a/scripts/mut_electrostatic_changes.py b/scripts/mut_electrostatic_changes.py new file mode 100755 index 0000000..20ea86a --- /dev/null +++ b/scripts/mut_electrostatic_changes.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +Created on Tue Aug 6 12:56:03 2019 + +@author: tanu +''' + +# FIXME: import dirs.py to get the basic dir paths available +#======================================================================= +# TASK: calculate how many mutations result in +# electrostatic changes wrt wt + +# Input: mcsm and AF_OR file + +# Output: mut_elec_changes_results.txt +#======================================================================= +#%% load libraries +import os, sys +import pandas as pd +#import numpy as np +#======================================================================= +#%% specify homedir and curr dir +homedir = os.path.expanduser('~') + +# set working dir +os.getcwd() +os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +os.getcwd() +#======================================================================= +#%% variable assignment: input and output +drug = 'pyrazinamide' +gene = 'pncA' +gene_match = gene + '_p.' + +#========== +# data dir +#========== +#indir = 'git/Data/pyrazinamide/input/original' +datadir = homedir + '/' + 'git/Data' + +#======= +# input +#======= +indir = datadir + '/' + drug + '/' + 'input' +in_filename = 'merged_df3.csv' +infile = outdir + '/' + in_filename +print('Input filename: ', in_filename + , '\nInput path: ', indir + , '\n============================================================') + +#======= +# output +#======= +outdir = datadir + '/' + drug + '/' + 'output' +# specify output file +out_filename = 'mut_elec_changes.txt' +outfile = outdir + '/' + out_filename +print('Output filename: ', out_filename + , '\nOutput path: ', outdir + , '\n============================================================') + +#%% end of variable assignment for input and output files +#======================================================================= +#%% Read input files +print('Reading input file (merged file):', infile) + +comb_df = pd.read_csv(infile, sep = ',') + +print('Input filename: ', in_filename + , '\nPath :', outdir + , '\nNo. of rows: ', len(comb_df) + , '\nNo. of cols: ', infile + , '\n============================================================') + +# column names +list(comb_df.columns) + +# clear variables +del(in_filename, infile) + +#%% subset unique mutations +df = comb_df.drop_duplicates(['Mutationinformation'], keep = 'first') + +total_muts = df.Mutationinformation.nunique() +#df.Mutationinformation.count() +print('Total mutations associated with structure: ', total_muts + , '\n===============================================================') + +#%% combine aa_calcprop cols so that you can count the changes as value_counts +# check if all muts have been categorised +print('Checking if all muts have been categorised: ') +if df['wt_calcprop'].isna().sum() == 0 & df['mut_calcprop'].isna().sum(): + print('PASS: No. NA detected i.e all muts have aa prop associated' + , '\n===============================================================') +else: + print('FAIL: NAs detected i.e some muts remain unclassified' + , '\n===============================================================') + +df['wt_calcprop'].head() +df['mut_calcprop'].head() + +print('Combining wt_calcprop and mut_calcprop...') +#df['aa_calcprop_combined'] = df['wt_calcprop']+ '->' + df['mut_calcprop'] +df['aa_calcprop_combined'] = df.wt_calcprop.str.cat(df.mut_calcprop, sep = '->') +df['aa_calcprop_combined'].head() + +mut_categ = df["aa_calcprop_combined"].unique() +print('Total no. of aa_calc properties: ', len(mut_categ)) +print('Categories are: ', mut_categ) + +# counting no. of muts in each mut categ + +# way1: count values within each combinaton +df.groupby('aa_calcprop_combined').size() +#df.groupby('aa_calcprop_combined').count() + +# way2: count values within each combinaton +df['aa_calcprop_combined'].value_counts() + +# comment: the two ways should be identical +# groupby result order is similar to pivot table order, +# I prefer the value_counts look + +# assign to variable: count values within each combinaton +all_prop = df['aa_calcprop_combined'].value_counts() + +# convert to a df from Series +ap_df = pd.DataFrame({'aa_calcprop': all_prop.index, 'mut_count': all_prop.values}) + +# subset df to contain only the changes in prop +all_prop_change = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == False] + +elec_count = all_prop_change.mut_count.sum() +print('Total no.of muts with elec changes: ', elec_count) + +# calculate percentage of electrostatic changes +elec_changes = (elec_count/total_muts) * 100 + +print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes) + +# check no change muts +no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == True] + +no_change_muts.mut_count.sum() + +#%% output from console +#sys.stdout = open(file, 'w') +sys.stdout = open(outfile, 'w') + +print('======================\n' + ,'Unchanged muts' + ,'\n=====================\n' + , no_change_muts + ,'\n=============================\n' + , 'Muts with changed prop:' + , '\n============================\n' + , all_prop_change) + +print('=================================================================' +, '\nTotal number of electrostatic changes resulting from Mtation is (%):', elec_changes +, '\nTotal no. of muts: ', total_muts +, '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum() +, '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum() +, '\n===================================================================') +#%% end of script +#=======================================================================