#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Wed Aug 19 14:33:51 2020 @author: tanu """ #%% load packages import os,sys import subprocess import argparse import requests import re import time from bs4 import BeautifulSoup import pandas as pd import numpy as np from pandas.api.types import is_string_dtype from pandas.api.types import is_numeric_dtype #%%##################################################################### def format_mcsm_na_output(mcsm_na_output_tsv): """ @param mcsm_na_outputcsv: file containing mcsm_na_results for all muts which is the result of combining all mcsm_na batch results, and using bash scripts to combine all the batch results into one file. This is post run_get_results_mcsm_na.py Formatting df to a pandas df and output as csv. @type string @return (not true) formatted csv for mcsm_na output @type pandas df """ ############# # Read file ############# mcsm_na_data_raw = pd.read_csv(mcsm_na_output_tsv, sep = '\t') # strip white space from both ends in all columns mcsm_na_data = mcsm_na_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x) dforig_shape = mcsm_na_data.shape print('dimensions of input file:', dforig_shape) ############# # rename cols ############# # format colnames: all lowercase and consistent colnames mcsm_na_data.columns print('Assigning meaningful colnames' , '\n=======================================================') my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded , 'CHAIN': 'chain' # {wild_type}{mutant_type} , 'WILD_RES': 'wild_type' # one letter amino acid code , 'RES_POS': 'position' # number , 'MUT_RES': 'mutant_type' # one letter amino acid code , 'RSA': 'rsa' # single letter (caps) , 'PRED_DDG': 'mcsm_na_affinity'} # 3-letter code mcsm_na_data.rename(columns = my_colnames_dict, inplace = True) mcsm_na_data.columns #%%============================================================================ ############# # create mutationinformation column ############# mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type'] #%%===================================================================== ############# # Create col: mcsm_na_outcome ############# # classification based on mcsm_na_affinity values print('Assigning col: mcsm_na_outcome based on mcsm_na_affinity') print('Sanity check:') # count positive values in the mcsm_na_affinity column c = mcsm_na_data[mcsm_na_data['mcsm_na_affinity']>=0].count() mcsm_na_pos = c.get(key = 'mcsm_na_affinity') # Assign category based on sign (+ve : I_affinity, -ve: R_affinity) mcsm_na_data['mcsm_na_outcome'] = np.where(mcsm_na_data['mcsm_na_affinity']>=0, 'Increased_affinity', 'Reduced_affinity') print('mcsm_na Outcome:', mcsm_na_data['mcsm_na_outcome'].value_counts()) #if mcsm_na_pos == mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']: # print('PASS: mcsm_na_outcome assigned correctly') #else: # print('FAIL: mcsm_na_outcome assigned incorrectly' # , '\nExpected no. of Increased_affinity mutations:', mcsm_na_pos # , '\nGot no. of Increased affinity mutations', mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity'] # , '\n======================================================') #%%===================================================================== ############# # scale mcsm_na values ############# # Rescale values in mcsm_na_affinity col b/w -1 and 1 so negative numbers # stay neg and pos numbers stay positive mcsm_na_min = mcsm_na_data['mcsm_na_affinity'].min() mcsm_na_max = mcsm_na_data['mcsm_na_affinity'].max() mcsm_na_scale = lambda x : x/abs(mcsm_na_min) if x < 0 else (x/mcsm_na_max if x >= 0 else 'failed') mcsm_na_data['mcsm_na_scaled'] = mcsm_na_data['mcsm_na_affinity'].apply(mcsm_na_scale) print('Raw mcsm_na scores:\n', mcsm_na_data['mcsm_na_affinity'] , '\n---------------------------------------------------------------' , '\nScaled mcsm_na scores:\n', mcsm_na_data['mcsm_na_scaled']) c2 = mcsm_na_data[mcsm_na_data['mcsm_na_scaled']>=0].count() mcsm_na_pos2 = c2.get(key = 'mcsm_na_affinity') if mcsm_na_pos == mcsm_na_pos2: print('\nPASS: Affinity values scaled correctly') else: print('\nFAIL: Affinity values scaled numbers MISmatch' , '\nExpected number:', mcsm_na_pos , '\nGot:', mcsm_na_pos2 , '\n======================================================') #%%===================================================================== ############# # reorder columns ############# mcsm_na_data.columns mcsm_na_dataf = mcsm_na_data[['mutationinformation' , 'mcsm_na_affinity' , 'mcsm_na_scaled' , 'mcsm_na_outcome' , 'rsa' , 'wild_type' , 'position' , 'mutant_type' , 'chain' , 'pdb_file']] return(mcsm_na_dataf) #%%#####################################################################