Added dynamut results formatting scripts, although they need to be rerun once b7 completes
This commit is contained in:
parent
64669eb05f
commit
5529fbf63d
2 changed files with 214 additions and 0 deletions
161
dynamut/format_results_dynamut.py
Normal file
161
dynamut/format_results_dynamut.py
Normal file
|
@ -0,0 +1,161 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Aug 19 14:33:51 2020
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#%% load packages
|
||||||
|
import os,sys
|
||||||
|
import subprocess
|
||||||
|
import argparse
|
||||||
|
import requests
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from pandas.api.types import is_string_dtype
|
||||||
|
from pandas.api.types import is_numeric_dtype
|
||||||
|
#%%#####################################################################
|
||||||
|
def format_dynamut_output(dynamut_output_csv):
    """
    Format the combined dynamut results: classify, scale and reorder columns.

    The input is the single csv obtained by combining all dynamut_output
    batch results (combined with bash scripts). This is post
    run_get_results_dynamut.py.

    For each of ddg_dynamut, ddg_encom, ddg_mcsm, ddg_sdm and ddg_duet a
    '<col>_outcome' column is created ('Stabilising' when value >= 0,
    otherwise 'Destabilising'). dds_encom gets 'Increased_flexibility'
    (value >= 0) or 'Decreased_flexibility'. All six columns also get a
    '<col>_scaled' column where negatives are divided by abs(column min)
    and positives by the column max, so the sign is always preserved.
    Missing values stay missing in both the scaled and outcome columns.

    @param dynamut_output_csv: file containing dynamut results for all muts
    @type string

    @return formatted dynamut results (the df is returned, nothing is
    written to disk by this function)
    @type pandas df

    @raise KeyError: if a required input column is absent from the csv
    """
    ddg_cols = ['ddg_dynamut', 'ddg_encom', 'ddg_mcsm', 'ddg_sdm', 'ddg_duet']
    flexibility_col = 'dds_encom'
    scale_cols = ddg_cols + [flexibility_col]

    #############
    # Read file
    #############
    dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',')

    # strip white space from both ends in all text columns
    # (object dtype, or the dedicated pandas string dtype)
    dynamut_data = dynamut_data_raw.apply(
        lambda x: x.str.strip()
        if x.dtype == 'object' or isinstance(x.dtype, pd.StringDtype)
        else x)

    dforig_shape = dynamut_data.shape
    print('dimensions of input file:', dforig_shape)

    # Fail early with one clear message rather than part-way through the loops
    required_cols = ['mutationinformation'] + scale_cols
    missing_cols = [c for c in required_cols if c not in dynamut_data.columns]
    if missing_cols:
        raise KeyError('Missing required column(s) in dynamut output: '
                       + ', '.join(missing_cols))

    #####################################
    # create binary cols for each param
    # >=0: Stabilising
    ######################################
    print('\nCreating classification cols for', len(ddg_cols), 'columns'
          , '\nThese are:')

    for cols in ddg_cols:
        print(cols)

        tot_muts = dynamut_data[cols].count()
        print('\nTotal entries:', tot_muts)

        outcome_colname = cols + '_outcome'
        print(cols, ':', outcome_colname)

        c1 = int((dynamut_data[cols] >= 0).sum())
        dynamut_data[outcome_colname] = _classify_by_sign(
            dynamut_data[cols], 'Stabilising', 'Destabilising')
        c2 = int((dynamut_data[outcome_colname] == 'Stabilising').sum())

        if c1 == c2:
            print('\nPASS: outcome classification column created successfully'
                  , '\nColumn created:', outcome_colname
                  , '\n\nCateg counts:\n'
                  , dynamut_data[outcome_colname].value_counts())
        else:
            print('\nFAIL: outcome classification numbers MISmatch'
                  , '\nexpected length:', c1
                  , '\nGot:', c2)

    # Different category names for: dds_encom
    dynamut_data[flexibility_col + '_outcome'] = _classify_by_sign(
        dynamut_data[flexibility_col]
        , 'Increased_flexibility'
        , 'Decreased_flexibility')

    ################################
    # scale all ddg param values
    #################################
    # Rescale values in all ddg cols (and dds_encom) b/w -1 and 1 so negative
    # numbers stay neg and pos numbers stay positive
    for cols in scale_cols:
        col_max = dynamut_data[cols].max()
        col_min = dynamut_data[cols].min()
        print( '\n===================='
              , '\nColname:', cols
              , '\n===================='
              , '\nMax: ', col_max
              , '\nMin: ', col_min)

        scaled_colname = cols + '_scaled'
        print('\nCreated scaled colname for', cols, ':', scaled_colname)

        dynamut_data[scaled_colname] = _scale_keep_sign(
            dynamut_data[cols], col_min, col_max)

        col_scaled_max = dynamut_data[scaled_colname].max()
        col_scaled_min = dynamut_data[scaled_colname].min()
        print( '\n===================='
              , '\nColname:', scaled_colname
              , '\n===================='
              , '\nMax: ', col_scaled_max
              , '\nMin: ', col_scaled_min)

    #############
    # reorder columns
    #############
    # mutationinformation first, then value/scaled/outcome for each param
    ordered_cols = ['mutationinformation']
    for cols in scale_cols:
        ordered_cols += [cols, cols + '_scaled', cols + '_outcome']

    dynamut_dataf = dynamut_data[ordered_cols]

    # Compare column counts on both sides (number of rows is irrelevant here);
    # a mismatch means the input had columns that the reordering dropped.
    if len(dynamut_data.columns) == len(dynamut_dataf.columns):
        print('\nPASS: outcome_classification, scaling and column reordering completed')
    else:
        print('\nFAIL: Something went wrong...'
              , '\nExpected length: ', len(dynamut_data.columns)
              , '\nGot: ', len(dynamut_dataf.columns))

    return dynamut_dataf


def _classify_by_sign(values, non_negative_label, negative_label):
    """Label values >= 0 and values < 0; missing values stay missing (NaN)."""
    outcome = pd.Series(np.nan, index = values.index, dtype = object)
    outcome[values >= 0] = non_negative_label
    outcome[values < 0] = negative_label
    return outcome


def _scale_keep_sign(values, col_min, col_max):
    """Divide negatives by abs(col_min) and positives by col_max; 0/NaN unchanged."""
    scaled = values.astype(float)
    negative = scaled < 0
    positive = scaled > 0
    # A negative value implies col_min < 0 and a positive value implies
    # col_max > 0, so neither division can be by zero. Zeros are left as 0
    # (avoids 0/0 when the column max is 0) and NaN stays NaN.
    scaled[negative] = scaled[negative] / abs(col_min)
    scaled[positive] = scaled[positive] / col_max
    return scaled
|
||||||
|
#%%#####################################################################
|
||||||
|
|
53
dynamut/run_format_results_dynamut.py
Normal file
53
dynamut/run_format_results_dynamut.py
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Fri Feb 12 12:15:26 2021
|
||||||
|
|
||||||
|
@author: tanu
|
||||||
|
"""
|
||||||
|
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
# FIXME
|
||||||
|
# RE RUN when B07 completes!!!! as norm gets affected!
|
||||||
|
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
|
||||||
|
#%% load packages
|
||||||
|
import os

homedir = os.path.expanduser('~')
# NOTE(review): the chdir presumably exists so that the bare-name import below
# resolves when the script is run cell-by-cell from an IDE -- confirm.
os.chdir(os.path.join(homedir, 'git', 'LSHTM_analysis', 'dynamut'))

# Import only the name this script uses instead of a wildcard import
from format_results_dynamut import format_dynamut_output

########################################################################
# variables

# TODO: add cmd line args

gene = 'gid'
drug = 'streptomycin'
datadir = os.path.join(homedir, 'git', 'Data')
indir = os.path.join(datadir, drug, 'input')
outdir = os.path.join(datadir, drug, 'output')
outdir_dynamut = os.path.join(outdir, 'dynamut_results')

# Input file: combined and cleaned dynamut output for all mutations
infile_dynamut = os.path.join(outdir_dynamut
                              , gene + '_dynamut_all_output_clean.csv')

# Formatted output filename
outfile_dynamut_f = os.path.join(outdir_dynamut
                                 , gene + '_complex_dynamut_norm.csv')

#==========================
# CALL: format_dynamut_output()
# Data: gid+streptomycin
#==========================
print('Formatting results for:', infile_dynamut)
dynamut_df_f = format_dynamut_output(dynamut_output_csv = infile_dynamut)

# writing file
print('Writing formatted dynamut df to csv')
dynamut_df_f.to_csv(outfile_dynamut_f, index = False)

print('Finished writing file:'
      , '\nFile:', outfile_dynamut_f
      , '\nExpected no. of rows:', len(dynamut_df_f)
      , '\nExpected no. of cols:', len(dynamut_df_f.columns)
      , '\n=============================================================')
|
||||||
|
|
||||||
|
#%%#####################################################################
|
Loading…
Add table
Add a link
Reference in a new issue