#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Wed Aug 19 14:33:51 2020 @author: tanu """ #%% load packages import os,sys homedir = os.path.expanduser('~') import subprocess import argparse import requests import re import time from bs4 import BeautifulSoup import pandas as pd import numpy as np from pandas.api.types import is_string_dtype from pandas.api.types import is_numeric_dtype sys.path.append(homedir + '/git/LSHTM_analysis/scripts') from reference_dict import up_3letter_aa_dict from reference_dict import oneletter_aa_dict #%%##################################################################### def format_mcsm_ppi2_output(mcsm_ppi2_output_csv): """ @param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all muts which is the result of combining all mcsm_ppi2 batch results, and using bash scripts to combine all the batch results into one file. Formatting df to a pandas df and output as csv. @type string @return (not true) formatted csv for mcsm_ppi2 output @type pandas df """ ############# # Read file ############# mcsm_ppi2_data_raw = pd.read_csv(mcsm_ppi2_output_csv, sep = ',') # strip white space from both ends in all columns mcsm_ppi2_data = mcsm_ppi2_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x) dforig_shape = mcsm_ppi2_data.shape print('dimensions of input file:', dforig_shape) ############# # Map 3 letter # code to one ############# # initialise a sub dict that is lookup dict for # 3-LETTER aa code to 1-LETTER aa code lookup_dict = dict() for k, v in up_3letter_aa_dict.items(): lookup_dict[k] = v['one_letter_code'] wt = mcsm_ppi2_data['wild-type'].squeeze() # converts to a series that map works on mcsm_ppi2_data['w_type'] = wt.map(lookup_dict) mut = mcsm_ppi2_data['mutant'].squeeze() mcsm_ppi2_data['m_type'] = mut.map(lookup_dict) # ############# # # CHECK # # Map 1 letter # # code to 3Upper # ############# # # initialise a sub dict that is lookup dict for # # 3-LETTER aa code to 1-LETTER aa code # lookup_dict = dict() # for k, v in oneletter_aa_dict.items(): # lookup_dict[k] = v['three_letter_code_upper'] # wt = mcsm_ppi2_data['w_type'].squeeze() #converts to a series that map works on # mcsm_ppi2_data['WILD'] = wt.map(lookup_dict) # mut = mcsm_ppi2_data['m_type'].squeeze() # mcsm_ppi2_data['MUT'] = mut.map(lookup_dict) # # check # mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD']) # mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT']) #%%============================================================================ ############# # rename cols ############# # format colnames: all lowercase and consistent colnames mcsm_ppi2_data.columns print('Assigning meaningful colnames' , '\n=======================================================') my_colnames_dict = {'chain': 'chain' , 'wild-type': 'wt_upper' , 'res-number': 'position' , 'mutant': 'mut_upper' , 'distance-to-interface': 'interface_dist' , 'mcsm-ppi2-prediction': 'mcsm_ppi2_affinity' , 'affinity': 'mcsm_ppi2_outcome' , 'w_type': 'wild_type' # one letter amino acid code , 'm_type': 'mutant_type' # one letter amino acid code } mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True) mcsm_ppi2_data.columns ############# # create mutationinformation column ############# #mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data['wild_type'] + mcsm_ppi2_data.position.map(str) + mcsm_ppi2_data['mutant_type'] mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data.loc[:,'wild_type'] + mcsm_ppi2_data.loc[:,'position'].astype(int).apply(str) + mcsm_ppi2_data.loc[:,'mutant_type'] #%%===================================================================== ######################### # scale mcsm_ppi2 values ######################### # Rescale values in mcsm_ppi2_affinity col b/w -1 and 1 so negative numbers # stay neg and pos numbers stay positive mcsm_ppi2_min = mcsm_ppi2_data['mcsm_ppi2_affinity'].min() mcsm_ppi2_max = mcsm_ppi2_data['mcsm_ppi2_affinity'].max() mcsm_ppi2_scale = lambda x : x/abs(mcsm_ppi2_min) if x < 0 else (x/mcsm_ppi2_max if x >= 0 else 'failed') mcsm_ppi2_data['mcsm_ppi2_scaled'] = mcsm_ppi2_data['mcsm_ppi2_affinity'].apply(mcsm_ppi2_scale) print('Raw mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_affinity'] , '\n---------------------------------------------------------------' , '\nScaled mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_scaled']) c = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_affinity']>=0].count() mcsm_ppi2_pos = c.get(key = 'mcsm_ppi2_affinity') c2 = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_scaled']>=0].count() mcsm_ppi2_pos2 = c2.get(key = 'mcsm_ppi2_scaled') if mcsm_ppi2_pos == mcsm_ppi2_pos2: print('\nPASS: Affinity values scaled correctly') else: print('\nFAIL: Affinity values scaled numbers MISmatch' , '\nExpected number:', mcsm_ppi2_pos , '\nGot:', mcsm_ppi2_pos2 , '\n======================================================') #%%===================================================================== ############# # reorder columns ############# mcsm_ppi2_data.columns mcsm_ppi2_dataf = mcsm_ppi2_data[['mutationinformation' , 'mcsm_ppi2_affinity' , 'mcsm_ppi2_scaled' , 'mcsm_ppi2_outcome' , 'interface_dist' , 'wild_type' , 'position' , 'mutant_type' , 'wt_upper' , 'mut_upper' , 'chain']] return(mcsm_ppi2_dataf) #%%#####################################################################