From 9e4b3c5dce67cb0e17d6090484e6cd6e6fb02404 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 11 Feb 2020 15:03:21 +0000 Subject: [PATCH] added script to calculate electrostatic changes of mutations --- .../mut_electrostatic_changes.py | 162 ++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100755 meta_data_analysis/mut_electrostatic_changes.py diff --git a/meta_data_analysis/mut_electrostatic_changes.py b/meta_data_analysis/mut_electrostatic_changes.py new file mode 100755 index 0000000..0453649 --- /dev/null +++ b/meta_data_analysis/mut_electrostatic_changes.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Aug 6 12:56:03 2019 + +@author: tanu +""" + +# FIXME: include error checking to enure you only +# concentrate on positions that have structural info + +#%% load libraries +################### +# load libraries +import os, sys +import pandas as pd +#import numpy as np + +#from pandas.api.types import is_string_dtype +#from pandas.api.types import is_numeric_dtype + +#==================================================== +# TASK: calculate how many mutations result in +# electrostatic changes wrt wt +# Input: mcsm and AF_OR file +# output: mut_elec_changes_results.txt +#======================================================== +#%% +#################### +# my working dir +os.getcwd() +homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde +os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') +os.getcwd() +#%% +from reference_dict import my_aa_dict #CHECK DIR STRUC THERE! +#%% +############# specify variables for input and output paths and filenames +drug = "pyrazinamide" +gene = "pnca" + +datadir = homedir + "/git/Data" +basedir = datadir + "/" + drug + "/input" + +# input +inpath = "/processed" + +# uncomment as necessary +in_filename = "/meta_data_with_AFandOR.csv" +#in_filename = "/mcsm_complex1_normalised.csv" # probably simpler + +infile = basedir + inpath + in_filename +#print(infile) + +# output file +outpath = "/output" +outdir = datadir + "/" + drug + outpath +out_filename = "/mut_elec_changes_results.txt" +outfile = outdir + out_filename + +#print(outdir) + +if not os.path.exists(datadir): + print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md') + os.makedirs(datadir) + exit() + +if not os.path.exists(outdir): + print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md') + exit() + +else: + print('Dir exists: Carrying on') + +################## end of variable assignment for input and output files +#%% +#============================================================================== +############ +# STEP 1: Read file +############ +meta_pnca = pd.read_csv(infile, sep = ',') + +# column names +list(meta_pnca.columns) + +#======== +# Step 2: iterate through the dict, create a lookup dict that i.e +# lookup_dict = {three_letter_code: aa_prop_polarity} +# Do this for both wild_type and mutant as above. +#========= +# initialise a sub dict that is lookup dict for three letter code to aa prop +lookup_dict = dict() + +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['aa_calcprop'] + #print(lookup_dict) + wt = meta_pnca['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on + meta_pnca['wt_calcprop'] = wt.map(lookup_dict) + mut = meta_pnca['mutation'].str.extract(r'\d+(\w{3})$').squeeze() + meta_pnca['mut_calcprop'] = mut.map(lookup_dict) + +# added two more cols + +# clear variables +del(k, v, wt, mut, lookup_dict) +del(in_filename, infile, inpath) + +#%% +########### +# Step 3: subset unique mutations +########### +meta_pnca_muts = meta_pnca.drop_duplicates(['Mutationinformation'], keep = 'first') +non_struc = meta_pnca_muts[meta_pnca_muts.position == 186] + +# remove pos non_struc 186 : (in case you used file with AF and OR) +df = meta_pnca_muts[meta_pnca_muts.position != 186] +total_muts = df.Mutationinformation.nunique() +#df.Mutationinformation.count() + +########### +# Step 4: combine cols +########### + +df['aa_calcprop_combined'] = df['wt_calcprop']+ '->' + df['mut_calcprop'] +df['aa_calcprop_combined'] + +# way1: count values within each combinaton +df.groupby('aa_calcprop_combined').size() +#df.groupby('aa_calcprop_combined').count() + +# way2: count values within each combinaton +#df['aa_calcprop_combined'].value_counts() + +# comment: the two ways should be identical +# groupby result order is similar to pivot table order + +#assign to variable: count values within each combinaton +all_prop = df.groupby('aa_calcprop_combined').size() + +# convert to a df from Series +ap_df = pd.DataFrame({'aa_calcprop': all_prop.index, 'mut_count': all_prop.values}) + +# subset df to contain only the changes in prop +all_prop_change = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == False] + +elec_count = all_prop_change.mut_count.sum() + +# calculate percentage of electrostatic changes +elec_changes = (elec_count/total_muts) * 100 + +print("Total number of electrostatic changes resulting from Mutation is (%):", elec_changes) + +########### +# Step 5: output from console +########### +#sys.stdout = open(file, 'w') +sys.stdout = open(outfile, 'w') + +print(df.groupby('aa_calcprop_combined').size() ) +print("=======================================================================================") +print("Total number of electrostatic changes resulting from Mutation is (%):", elec_changes) +print("=======================================================================================") \ No newline at end of file