#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Tue Aug 6 12:56:03 2019 @author: tanu """ # FIXME: include error checking to enure you only # concentrate on positions that have structural info #%% load libraries ################### # load libraries import os, sys import pandas as pd #import numpy as np #from pandas.api.types import is_string_dtype #from pandas.api.types import is_numeric_dtype #==================================================== # TASK: calculate how many mutations result in # electrostatic changes wrt wt # Input: mcsm and AF_OR file # output: mut_elec_changes_results.txt #======================================================== #%% #################### # my working dir os.getcwd() homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.getcwd() #%% from reference_dict import my_aa_dict #CHECK DIR STRUC THERE! #%% ############# specify variables for input and output paths and filenames drug = "pyrazinamide" gene = "pnca" datadir = homedir + "/git/Data" basedir = datadir + "/" + drug + "/input" # input inpath = "/processed" # uncomment as necessary in_filename = "/meta_data_with_AFandOR.csv" #in_filename = "/mcsm_complex1_normalised.csv" # probably simpler infile = basedir + inpath + in_filename #print(infile) # output file outpath = "/output" outdir = datadir + "/" + drug + outpath out_filename = "/mut_elec_changes_results.txt" outfile = outdir + out_filename #print(outdir) if not os.path.exists(datadir): print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md') os.makedirs(datadir) exit() if not os.path.exists(outdir): print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md') exit() else: print('Dir exists: Carrying on') ################## end of variable assignment for input and output files #%% #============================================================================== ############ # STEP 1: Read file ############ meta_pnca = pd.read_csv(infile, sep = ',') # column names list(meta_pnca.columns) #======== # Step 2: iterate through the dict, create a lookup dict that i.e # lookup_dict = {three_letter_code: aa_prop_polarity} # Do this for both wild_type and mutant as above. #========= # initialise a sub dict that is lookup dict for three letter code to aa prop lookup_dict = dict() for k, v in my_aa_dict.items(): lookup_dict[k] = v['aa_calcprop'] #print(lookup_dict) wt = meta_pnca['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on meta_pnca['wt_calcprop'] = wt.map(lookup_dict) mut = meta_pnca['mutation'].str.extract(r'\d+(\w{3})$').squeeze() meta_pnca['mut_calcprop'] = mut.map(lookup_dict) # added two more cols # clear variables del(k, v, wt, mut, lookup_dict) del(in_filename, infile, inpath) #%% ########### # Step 3: subset unique mutations ########### meta_pnca_muts = meta_pnca.drop_duplicates(['Mutationinformation'], keep = 'first') non_struc = meta_pnca_muts[meta_pnca_muts.position == 186] # remove pos non_struc 186 : (in case you used file with AF and OR) df = meta_pnca_muts[meta_pnca_muts.position != 186] total_muts = df.Mutationinformation.nunique() #df.Mutationinformation.count() ########### # Step 4: combine cols ########### df['aa_calcprop_combined'] = df['wt_calcprop']+ '->' + df['mut_calcprop'] df['aa_calcprop_combined'] # way1: count values within each combinaton df.groupby('aa_calcprop_combined').size() #df.groupby('aa_calcprop_combined').count() # way2: count values within each combinaton #df['aa_calcprop_combined'].value_counts() # comment: the two ways should be identical # groupby result order is similar to pivot table order #assign to variable: count values within each combinaton all_prop = df.groupby('aa_calcprop_combined').size() # convert to a df from Series ap_df = pd.DataFrame({'aa_calcprop': all_prop.index, 'mut_count': all_prop.values}) # subset df to contain only the changes in prop all_prop_change = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == False] elec_count = all_prop_change.mut_count.sum() # calculate percentage of electrostatic changes elec_changes = (elec_count/total_muts) * 100 print("Total number of electrostatic changes resulting from Mutation is (%):", elec_changes) ########### # Step 5: output from console ########### #sys.stdout = open(file, 'w') sys.stdout = open(outfile, 'w') print(df.groupby('aa_calcprop_combined').size() ) print("=======================================================================================") print("Total number of electrostatic changes resulting from Mutation is (%):", elec_changes) print("=======================================================================================")