#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os
import sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def format_dynamut2_output(dynamut_output_csv):
    """
    Format combined DynaMut2 results into a tidy pandas DataFrame.

    The input is a single CSV holding the DynaMut2 results for all
    mutations (the batch outputs, run manually, combined into one file).
    The function:

    1. reads the CSV and strips leading/trailing white space from every
       text column;
    2. adds 'ddg_dynamut2_outcome': 'Stabilising' when ddg_dynamut2 >= 0,
       'Destabilising' when it is < 0, and missing when the value is
       missing;
    3. adds 'ddg_dynamut2_scaled': negative values are divided by
       abs(column minimum) and non-negative values by the column maximum,
       so that the sign is preserved and values lie between -1 and 1.
       Missing values stay missing, and zeros stay zero even when the
       column maximum is 0;
    4. reorders the columns to: mutationinformation, chain, ddg_dynamut2,
       ddg_dynamut2_scaled, ddg_dynamut2_outcome.

    Progress and PASS/FAIL diagnostics are printed to stdout.

    @param dynamut_output_csv: path of the combined DynaMut2 results csv.
        It must contain exactly the columns 'mutationinformation',
        'chain' and 'ddg_dynamut2'.
    @type string

    @return formatted DynaMut2 results (nothing is written to disk here)
    @type pandas df

    @raise SystemExit: with exit status 1 when the columns of the
        formatted data differ from the expected five columns (e.g. the
        input carried extra columns).
    @raise KeyError: when a required column is absent from the input.
    """
    #############
    # Read file
    #############
    dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep=',')

    # Strip white space from both ends in all text columns.
    # is_string_dtype matches plain object columns as well as pandas'
    # dedicated string dtype, so stripping is not silently skipped.
    dynamut_data = dynamut_data_raw.apply(
        lambda x: x.str.strip() if is_string_dtype(x.dtype) else x)

    dforig_shape = dynamut_data.shape
    print('dimensions of input file:', dforig_shape)

    #%%=================================================================
    #####################################
    # create binary cols for ddg_dynamut2
    # >=0: Stabilising
    ######################################
    outcome_cols = ['ddg_dynamut2']

    print('\nCreating classification cols for', len(outcome_cols), 'columns'
          , '\nThese are:')

    for col in outcome_cols:
        print(col)
        values = dynamut_data[col]

        tot_muts = values.count()
        print('\nTotal entries:', tot_muts)

        outcome_colname = col + '_outcome'
        print(col, ':', outcome_colname)

        is_stabilising = values >= 0
        n_expected = int(is_stabilising.sum())

        labels = pd.Series(
            np.where(is_stabilising, 'Stabilising', 'Destabilising')
            , index=values.index)
        # NaN >= 0 is False, which would label a missing ddG as
        # 'Destabilising'; keep missing values missing instead.
        dynamut_data[outcome_colname] = labels.where(values.notna())

        n_got = int((dynamut_data[outcome_colname] == 'Stabilising').sum())

        if n_expected == n_got:
            print('\nPASS: outcome classification column created successfully'
                  , '\nColumn created:', outcome_colname
                  , '\n\nCateg counts:\n'
                  , dynamut_data[outcome_colname].value_counts())
        else:
            print('\nFAIL: outcome classification numbers MISmatch'
                  , '\nexpected length:', n_expected
                  , '\nGot:', n_got)

    #%%=================================================================
    ################################
    # scale all ddg_dynamut2 values
    #################################
    # Rescale values in the ddg_dynamut2 col b/w -1 and 1 so negative
    # numbers stay neg and pos numbers stay positive
    for col in outcome_cols:
        values = dynamut_data[col]
        col_max = values.max()
        col_min = values.min()
        print('\n===================='
              , '\nColname:', col
              , '\n===================='
              , '\nMax: ', col_max
              , '\nMin: ', col_min)

        scaled_colname = col + '_scaled'
        print('\nCreated scaled colname for', col, ':', scaled_colname)

        # A divisor is only meaningful when values of that sign exist.
        # Falling back to 1 avoids dividing by zero (e.g. a column whose
        # maximum is exactly 0) without changing any scaled value.
        neg_divisor = abs(col_min) if col_min < 0 else 1
        pos_divisor = col_max if col_max > 0 else 1

        # NaN is not < 0, so it takes the second branch and stays NaN.
        dynamut_data[scaled_colname] = np.where(values < 0
                                                , values / neg_divisor
                                                , values / pos_divisor)

        col_scaled_max = dynamut_data[scaled_colname].max()
        col_scaled_min = dynamut_data[scaled_colname].min()
        print('\n===================='
              , '\nColname:', scaled_colname
              , '\n===================='
              , '\nMax: ', col_scaled_max
              , '\nMin: ', col_scaled_min)

    #%%=================================================================
    #############
    # reorder columns
    #############
    dynamut_data_f = dynamut_data[['mutationinformation'
                                   , 'chain'
                                   , 'ddg_dynamut2'
                                   , 'ddg_dynamut2_scaled'
                                   , 'ddg_dynamut2_outcome']]

    same_length = len(dynamut_data.columns) == len(dynamut_data_f.columns)
    same_names = sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns)

    if same_length and same_names:
        print('\nPASS: outcome_classification, scaling and column reordering completed')
    else:
        print('\nFAIL: Something went wrong...'
              , '\nExpected length: ', len(dynamut_data.columns)
              , '\nGot: ', len(dynamut_data_f.columns))
        # Non-zero status so that calling scripts can detect the failure.
        sys.exit(1)

    return dynamut_data_f
#%%#####################################################################