def _classify_by_sign(values, nonneg_label, neg_label):
    """
    Label every entry of a numeric pandas Series by its sign.

    @param values: numeric column to classify
    @type pandas Series
    @param nonneg_label: label used where value >= 0
    @type string
    @param neg_label: label used where value < 0
    @type string

    @return labels aligned with values; missing values stay missing
    @type pandas Series
    """
    labels = (values >= 0).map({True: nonneg_label, False: neg_label})
    # NaN >= 0 evaluates to False, so without this mask a missing value
    # would silently be reported as neg_label
    return labels.where(values.notna())


def _scale_by_sign(values, col_min, col_max):
    """
    Rescale a numeric pandas Series to lie between -1 and 1, keeping the sign.

    Negative values are divided by abs(col_min), positive values by col_max.
    Zeros and missing values are returned unchanged, so an all-non-positive
    column (col_max == 0) no longer triggers a division by zero.

    @param values: numeric column to rescale
    @type pandas Series
    @param col_min: minimum of values
    @type number
    @param col_max: maximum of values
    @type number

    @return rescaled copy of values (float)
    @type pandas Series
    """
    scaled = values.astype(float)
    is_neg = scaled < 0
    is_pos = scaled > 0
    scaled[is_neg] = scaled[is_neg] / abs(col_min)
    scaled[is_pos] = scaled[is_pos] / col_max
    return scaled


#%%#####################################################################
def format_dynamut_output(dynamut_output_csv):
    """
    Classify, scale and reorder the combined dynamut results.

    For each of the ddg columns and for dds_encom this adds
      - '<col>_outcome': category based on the sign of the value
        (>= 0: 'Stabilising' / 'Increased_flexibility',
         <  0: 'Destabilising' / 'Decreased_flexibility')
      - '<col>_scaled' : value rescaled between -1 and 1 so that negative
        numbers stay negative and positive numbers stay positive
    Missing values stay missing in both derived columns.

    @param dynamut_output_csv: file containing dynamut results for all muts
    which is the result of combining all dynamut_output batch results, and using
    bash scripts to combine all the batch results into one file.
    This is post run_get_results_dynamut.py
    @type string (anything accepted by pd.read_csv)

    @return formatted dynamut results: 'mutationinformation' followed by
    value, scaled value and outcome for every param. Nothing is written to
    disk here.
    @type pandas df

    @raise KeyError if an expected column is absent from the input file
    """
    ddg_cols = ['ddg_dynamut', 'ddg_encom', 'ddg_mcsm', 'ddg_sdm', 'ddg_duet']
    dds_col = 'dds_encom'
    param_cols = ddg_cols + [dds_col]

    #############
    # Read file
    #############
    dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',')

    # strip white space from both ends in all text columns
    # (object dtype, or a dedicated pandas string dtype)
    dynamut_data = dynamut_data_raw.apply(
        lambda x: x.str.strip()
        if (x.dtype == object or pd.api.types.is_string_dtype(x.dtype))
        else x)

    dforig_shape = dynamut_data.shape
    print('dimensions of input file:', dforig_shape)

    # fail early, naming every absent column, instead of part way through
    missing_cols = [c for c in ['mutationinformation'] + param_cols
                    if c not in dynamut_data.columns]
    if missing_cols:
        raise KeyError('Missing expected column(s) in dynamut output: '
                       + ', '.join(missing_cols))

#%%============================================================================
    #####################################
    # create binary cols for each param
    # >=0: Stabilising
    ######################################
    print('\nCreating classification cols for', len(ddg_cols), 'columns'
          , '\nThese are:')

    for cols in ddg_cols:
        print(cols)

        tot_muts = dynamut_data[cols].count()
        print('\nTotal entries:', tot_muts)

        outcome_colname = cols + '_outcome'
        print(cols, ':', outcome_colname)
        c1 = int((dynamut_data[cols] >= 0).sum())
        dynamut_data[outcome_colname] = _classify_by_sign(dynamut_data[cols]
                                                          , 'Stabilising'
                                                          , 'Destabilising')
        c2 = int((dynamut_data[outcome_colname] == 'Stabilising').sum())
        if c1 == c2:
            print('\nPASS: outcome classification column created successfully'
                  , '\nColumn created:', outcome_colname
                  , '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts() )
        else:
            print('\nFAIL: outcome classification numbers MISmatch'
                  , '\nexpected length:', c1
                  , '\nGot:', c2)

    # dds_encom uses flexibility categories rather than stability ones
    dynamut_data[dds_col + '_outcome'] = _classify_by_sign(dynamut_data[dds_col]
                                                           , 'Increased_flexibility'
                                                           , 'Decreased_flexibility')

#%%=====================================================================
    ################################
    # scale all ddg param values
    #################################
    # Rescale values in all ddg cols (and dds_encom) b/w -1 and 1 so negative
    # numbers stay neg and pos numbers stay positive
    for cols in param_cols:
        col_max = dynamut_data[cols].max()
        col_min = dynamut_data[cols].min()
        print( '\n===================='
              , '\nColname:', cols
              , '\n===================='
              , '\nMax: ', col_max
              , '\nMin: ', col_min)

        scaled_colname = cols + '_scaled'
        print('\nCreated scaled colname for', cols, ':', scaled_colname)

        dynamut_data[scaled_colname] = _scale_by_sign(dynamut_data[cols]
                                                      , col_min
                                                      , col_max)

        col_scaled_max = dynamut_data[scaled_colname].max()
        col_scaled_min = dynamut_data[scaled_colname].min()
        print( '\n===================='
              , '\nColname:', scaled_colname
              , '\n===================='
              , '\nMax: ', col_scaled_max
              , '\nMin: ', col_scaled_min)

#%%=====================================================================
    #############
    # reorder columns
    #############
    # mutationinformation first, then value, scaled value and outcome per param
    ordered_cols = ['mutationinformation']
    for cols in param_cols:
        ordered_cols.extend([cols, cols + '_scaled', cols + '_outcome'])

    dynamut_dataf = dynamut_data[ordered_cols]

    # compare number of COLUMNS on both sides: len() of a df is its row count
    if len(dynamut_data.columns) == len(dynamut_dataf.columns):
        print('\nPASS: outcome_classification, scaling and column reordering completed')
    else:
        print('\nFAIL: Something went wrong...'
              , '\nExpected length: ', len(dynamut_data.columns)
              , '\nGot: ', len(dynamut_dataf.columns))

    return(dynamut_dataf)
#%%#####################################################################
#%% load packages
import os

homedir = os.path.expanduser('~')
# NOTE(review): presumably the working directory is changed so that the
# sibling module format_results_dynamut resolves on import - confirm
os.chdir(homedir + '/git/LSHTM_analysis/dynamut')
# import only the name used below rather than everything the module exposes
from format_results_dynamut import format_dynamut_output
########################################################################
# variables

# TODO: add cmd line args

gene = 'gid'
drug = 'streptomycin'
datadir = homedir + '/git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'
outdir_dynamut = outdir + '/dynamut_results/'

# Input file
infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv'

# Formatted output filename
outfile_dynamut_f = outdir_dynamut + gene + '_complex_dynamut_norm.csv'

#==========================
# CALL: format_dynamut_output()
# Data: gid+streptomycin
#==========================
print('Formatting results for:', infile_dynamut)
dynamut_df_f = format_dynamut_output(dynamut_output_csv = infile_dynamut)

# writing file
print('Writing formatted dynamut df to csv')
dynamut_df_f.to_csv(outfile_dynamut_f, index = False)

print('Finished writing file:'
      , '\nFile:', outfile_dynamut_f
      , '\nExpected no. of rows:', len(dynamut_df_f)
      , '\nExpected no. of cols:', len(dynamut_df_f.columns)
      , '\n=============================================================')

#%%#####################################################################