#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os
import sys
import subprocess
import argparse
import re
import time

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################

def _classify_by_sign(values, non_negative_label, negative_label):
    """
    Label every entry of a numeric column by its sign.

    @param values: numeric column to classify
    @type pandas Series
    @param non_negative_label: label used for entries >= 0
    @type string
    @param negative_label: label used for entries < 0
    @type string

    @return labels aligned with values; missing entries stay missing
    @type pandas Series
    """
    labels = (values >= 0).map({True: non_negative_label
                                , False: negative_label})
    # NaN >= 0 is False, so without this step a missing value would be
    # reported with the 'negative' label.
    return labels.mask(values.isna())


def _scale_by_sign(values):
    """
    Rescale a numeric column so that negative numbers stay negative and
    positive numbers stay positive: negative entries are divided by
    abs(column min) and positive entries by the column max.

    @param values: numeric column to rescale
    @type pandas Series

    @return float column aligned with values; zeros stay 0 and missing
    entries stay missing
    @type pandas Series
    """
    col_max = values.max()
    col_min = values.min()

    # Zero and NaN entries are left untouched by both masks below. This
    # avoids 0/0 when the column max is 0 and keeps missing values as NaN
    # rather than turning them into a non-numeric placeholder.
    scaled = values.astype(float)
    scaled = scaled.mask(values < 0, values / abs(col_min))
    scaled = scaled.mask(values > 0, values / col_max)
    return scaled


def format_dynamut_output(dynamut_output_csv):
    """
    Read the combined dynamut results, add an outcome column and a scaled
    column for every ddg/dds parameter, and return the columns in a fixed
    order. Progress and sanity-check messages are printed to stdout.

    @param dynamut_output_csv: file containing dynamut results for all muts
    which is the result of combining all dynamut_output batch results, and
    using bash scripts to combine all the batch results into one file.
    This is post run_get_results_dynamut.py
    The file is read as comma separated and must contain the columns:
    mutationinformation, ddg_dynamut, ddg_encom, ddg_mcsm, ddg_sdm,
    ddg_duet and dds_encom.
    @type string

    @return formatted dynamut output (nothing is written to disk here).
    For each parameter <col> the result holds <col>, <col>_scaled and
    <col>_outcome. Any other input columns are not returned.
    @type pandas df
    """
    #############
    # Read file
    #############
    dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',')

    # strip white space from both ends in all text columns
    # (checking the dtype covers object columns as well as pandas'
    # dedicated string dtypes)
    dynamut_data = dynamut_data_raw.apply(
        lambda x: x.str.strip() if is_string_dtype(x.dtype) else x)

    dforig_shape = dynamut_data.shape
    print('dimensions of input file:', dforig_shape)

    #%%============================================================================
    #####################################
    # create binary cols for each param
    # >=0: Stabilising
    ######################################
    outcome_cols = ['ddg_dynamut', 'ddg_encom', 'ddg_mcsm','ddg_sdm', 'ddg_duet']

    print('\nCreating classification cols for', len(outcome_cols), 'columns'
          , '\nThese are:')

    for col in outcome_cols:
        print(col)

        tot_muts = dynamut_data[col].count()
        print('\nTotal entries:', tot_muts)

        outcome_colname = col + '_outcome'
        print(col, ':', outcome_colname)

        # c1: expected no. of stabilising muts, counted on the raw values
        c1 = len(dynamut_data[dynamut_data[col] >= 0])
        dynamut_data[outcome_colname] = _classify_by_sign(dynamut_data[col]
                                                          , 'Stabilising'
                                                          , 'Destabilising')
        # c2: no. of stabilising muts actually labelled
        c2 = len(dynamut_data[dynamut_data[outcome_colname] == 'Stabilising'])

        if c1 == c2:
            print('\nPASS: outcome classification column created successfully'
                  , '\nColumn created:', outcome_colname
                  , '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts() )
        else:
            print('\nFAIL: outcome classification numbers MISmatch'
                  , '\nexpected length:', c1
                  , '\nGot:', c2)

    # Different categ names for: dds_encom
    dynamut_data['dds_encom_outcome'] = _classify_by_sign(dynamut_data['dds_encom']
                                                          , 'Increased_flexibility'
                                                          , 'Decreased_flexibility')

    #%%=====================================================================
    ################################
    # scale all ddg param values
    #################################
    # Rescale values in all ddg cols col b/w -1 and 1 so negative numbers
    # stay neg and pos numbers stay positive
    scale_cols = ['ddg_dynamut', 'ddg_encom', 'ddg_mcsm','ddg_sdm', 'ddg_duet', 'dds_encom']

    for col in scale_cols:
        col_max = dynamut_data[col].max()
        col_min = dynamut_data[col].min()
        print( '\n===================='
              , '\nColname:', col
              , '\n===================='
              , '\nMax: ', col_max
              , '\nMin: ', col_min)

        scaled_colname = col + '_scaled'
        print('\nCreated scaled colname for', col, ':', scaled_colname)

        dynamut_data[scaled_colname] = _scale_by_sign(dynamut_data[col])

        col_scaled_max = dynamut_data[scaled_colname].max()
        col_scaled_min = dynamut_data[scaled_colname].min()
        print( '\n===================='
              , '\nColname:', scaled_colname
              , '\n===================='
              , '\nMax: ', col_scaled_max
              , '\nMin: ', col_scaled_min)

    #%%=====================================================================
    #############
    # reorder columns
    #############
    dynamut_dataf = dynamut_data[['mutationinformation'
                                  , 'ddg_dynamut'
                                  , 'ddg_dynamut_scaled'
                                  , 'ddg_dynamut_outcome'
                                  , 'ddg_encom'
                                  , 'ddg_encom_scaled'
                                  , 'ddg_encom_outcome'
                                  , 'ddg_mcsm'
                                  , 'ddg_mcsm_scaled'
                                  , 'ddg_mcsm_outcome'
                                  , 'ddg_sdm'
                                  , 'ddg_sdm_scaled'
                                  , 'ddg_sdm_outcome'
                                  , 'ddg_duet'
                                  , 'ddg_duet_scaled'
                                  , 'ddg_duet_outcome'
                                  , 'dds_encom'
                                  , 'dds_encom_scaled'
                                  , 'dds_encom_outcome']]

    # Compare column counts on both sides: len() of a df is its number of
    # rows, which says nothing about whether all columns were carried over.
    if len(dynamut_data.columns) == len(dynamut_dataf.columns):
        print('\nPASS: outcome_classification, scaling and column reordering completed')
    else:
        print('\nFAIL: Something went wrong...'
              , '\nExpected length: ', len(dynamut_data.columns)
              , '\nGot: ', len(dynamut_dataf.columns))

    return(dynamut_dataf)
#%%#####################################################################