#!/usr/bin/env python3 # -*- coding: utf-8 -*- ''' Created on Tue Aug 6 12:56:03 2019 @author: tanu ''' # FIXME: import dirs.py to get the basic dir paths available #======================================================================= # TASK: calculate how many mutations result in # electrostatic changes wrt wt # Input: mcsm and AF_OR file # Output: mut_elec_changes_results.txt #======================================================================= #%% load libraries import os, sys import pandas as pd #import numpy as np #%% specify homedir as python doesn't recognise tilde homedir = os.path.expanduser('~') # set working dir os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') os.getcwd() #======================================================================= #%% variable assignment: input and output paths & filenames drug = 'pyrazinamide' gene = 'pncA' gene_match = gene + '_p.' #========== # data dir #========== #indir = 'git/Data/pyrazinamide/input/original' datadir = homedir + '/' + 'git/Data' #======= # input #======= indir = datadir + '/' + drug + '/' + 'input' in_filename = 'merged_df3.csv' infile = outdir + '/' + in_filename print('Input filename: ', in_filename , '\nInput path: ', indir) #======= # output #======= outdir = datadir + '/' + drug + '/' + 'output' # specify output file out_filename = 'mut_elec_changes.txt' outfile = outdir + '/' + out_filename print('Output filename: ', out_filename , '\nOutput path: ', outdir) #%% end of variable assignment for input and output files #======================================================================= #%% Read input files #in_filename = gene.lower() + '_meta_data_with_AFandOR.csv' print('Reading input file (merged file):', infile) comb_df = pd.read_csv(infile, sep = ',') print('Input filename: ', in_filename, '\nPath :', outdir, '\nNo. of rows: ', len(comb_df), '\nNo. of cols: ', infile) # column names list(comb_df.columns) # clear variables del(in_filename, infile) #%% subset unique mutations df = comb_df.drop_duplicates(['Mutationinformation'], keep = 'first') total_muts = df.Mutationinformation.nunique() #df.Mutationinformation.count() print('Total mutations associated with structure: ', total_muts) #%% combine aa_calcprop cols so that you can count the changes as value_counts # check if all muts have been categorised print('Checking if all muts have been categorised: ') if df['wt_calcprop'].isna().sum() == 0 & df['mut_calcprop'].isna().sum(): print('PASS: No. NA detected i.e all muts have aa prop associated') else: print('FAIL: NAs detected i.e some muts remain unclassified') df['wt_calcprop'].head() df['mut_calcprop'].head() print('Combining wt_calcprop and mut_calcprop...') #df['aa_calcprop_combined'] = df['wt_calcprop']+ '->' + df['mut_calcprop'] df['aa_calcprop_combined'] = df.wt_calcprop.str.cat(df.mut_calcprop, sep = '->') df['aa_calcprop_combined'].head() mut_categ = df["aa_calcprop_combined"].unique() print('Total no. of aa_calc properties: ', len(mut_categ)) print('Categories are: ', mut_categ) # counting no. of muts in each mut categ # way1: count values within each combinaton df.groupby('aa_calcprop_combined').size() #df.groupby('aa_calcprop_combined').count() # way2: count values within each combinaton df['aa_calcprop_combined'].value_counts() # comment: the two ways should be identical # groupby result order is similar to pivot table order, # I prefer the value_counts look # assign to variable: count values within each combinaton all_prop = df['aa_calcprop_combined'].value_counts() # convert to a df from Series ap_df = pd.DataFrame({'aa_calcprop': all_prop.index, 'mut_count': all_prop.values}) # subset df to contain only the changes in prop all_prop_change = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == False] elec_count = all_prop_change.mut_count.sum() print('Total no.of muts with elec changes: ', elec_count) # calculate percentage of electrostatic changes elec_changes = (elec_count/total_muts) * 100 print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes) # check no change muts no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == True] no_change_muts.mut_count.sum() ########### # Step 5: output from console ########### #sys.stdout = open(file, 'w') sys.stdout = open(outfile, 'w') #print(no_change_muts, '\n', # all_prop_change) print('======================\n' ,'Unchanged muts' ,'\n=====================\n' , no_change_muts ,'\n=============================\n' , 'Muts with changed prop:' , '\n============================\n' , all_prop_change) #print('======================================================================') #print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes) #print('Total no. of muts: ', total_muts) #print('Total no. of changed muts: ', all_prop_change.mut_count.sum()) #print('Total no. of unchanged muts: ', no_change_muts.mut_count.sum() ) #print('=======================================================================') print('========================================================================' , '\nTotal number of electrostatic changes resulting from Mtation is (%):', elec_changes , '\nTotal no. of muts: ', total_muts , '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum() , '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum() , '\n=========================================================================')