added format_results_dynamut2.py and ran shiny scripts for barplots

This commit is contained in:
Tanushree Tunstall 2021-08-19 16:25:38 +01:00
parent 8cdf720702
commit f7aac58081
9 changed files with 235 additions and 59 deletions

View file

@ -123,7 +123,7 @@ def format_dynamut_output(dynamut_output_csv):
# reorder columns
#############
dynamut_data.columns
dynamut_dataf = dynamut_data[['mutationinformation'
dynamut_data_f = dynamut_data[['mutationinformation'
, 'ddg_dynamut'
, 'ddg_dynamut_scaled'
@ -149,13 +149,14 @@ def format_dynamut_output(dynamut_output_csv):
, 'dds_encom_scaled'
, 'dds_encom_outcome']]
if len(dynamut_data.columns) == len(dynamut_dataf):
if len(dynamut_data.columns) == len(dynamut_data_f.columns) and sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns):
print('\nPASS: outcome_classification, scaling and column reordering completed')
else:
print('\nFAIL: Something went wrong...'
, '\nExpected length: ', len(dynamut_data.columns)
, '\nGot: ', len(dynamut_dataf))
, '\nGot: ', len(dynamut_data_f.columns))
sys.exit()
return(dynamut_dataf)
return(dynamut_data_f)
#%%#####################################################################

View file

@ -0,0 +1,137 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020
@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def format_dynamut2_output(dynamut_output_csv):
    """
    Read the combined DynaMut2 results csv and return a formatted pandas df.

    The input file is the result of combining all dynamut2 batch outputs
    (DynaMut2 was run manually in batches and the batch results were
    combined into one file with bash scripts).

    Two columns are derived from 'ddg_dynamut2':
      - 'ddg_dynamut2_outcome': 'Stabilising' when the value is >= 0,
        otherwise 'Destabilising'. Missing values stay missing.
      - 'ddg_dynamut2_scaled': negative values are divided by abs(column min)
        and non-negative values by the column max, so negative numbers stay
        negative and positive numbers stay positive. Zero stays zero and
        missing values stay missing.

    @param dynamut_output_csv: path of the csv file containing dynamut2
        results for all muts. It must contain exactly the columns
        'mutationinformation', 'chain' and 'ddg_dynamut2'.
    @type string

    @return formatted df with the columns 'mutationinformation', 'chain',
        'ddg_dynamut2', 'ddg_dynamut2_scaled', 'ddg_dynamut2_outcome'
        (the df is returned, nothing is written to disk here)
    @type pandas df

    Exits with status 1 (SystemExit) when the columns of the formatted df do
    not match the columns of the processed input.
    """
    #############
    # Read file
    #############
    dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',')

    # strip white space from both ends in all text columns
    # (object dtype as well as pandas' dedicated string dtypes)
    dynamut_data = dynamut_data_raw.apply(
        lambda x: x.str.strip()
        if (x.dtype == 'object' or pd.api.types.is_string_dtype(x))
        else x)

    dforig_shape = dynamut_data.shape
    print('dimensions of input file:', dforig_shape)

    # columns that get an '_outcome' and a '_scaled' companion column
    outcome_cols = ['ddg_dynamut2']

    #%%============================================================================
    #####################################
    # create binary cols for ddg_dynamut2
    # >=0: Stabilising
    ######################################
    print('\nCreating classification cols for', len(outcome_cols), 'columns'
          , '\nThese are:')

    for cols in outcome_cols:
        print(cols)

        tot_muts = dynamut_data[cols].count()
        print('\nTotal entries:', tot_muts)

        outcome_colname = cols + '_outcome'
        print(cols, ':', outcome_colname)

        is_stabilising = dynamut_data[cols] >= 0
        c1 = int(is_stabilising.sum())

        outcome = pd.Series(np.where(is_stabilising, 'Stabilising', 'Destabilising')
                            , index = dynamut_data.index)
        # a missing ddg value has no outcome: keep it missing instead of
        # silently labelling it 'Destabilising'
        dynamut_data[outcome_colname] = outcome.where(dynamut_data[cols].notna())

        c2 = len(dynamut_data[dynamut_data[outcome_colname] == 'Stabilising'])

        if c1 == c2:
            print('\nPASS: outcome classification column created successfully'
                  , '\nColumn created:', outcome_colname
                  , '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts() )
        else:
            print('\nFAIL: outcome classification numbers MISmatch'
                  , '\nexpected length:', c1
                  , '\nGot:', c2)

    #%%=====================================================================
    ################################
    # scale all ddg_dynamut2 values
    #################################
    # Rescale values in all ddg_dynamut2 cols b/w -1 and 1 so negative numbers
    # stay neg and pos numbers stay positive
    for cols in outcome_cols:
        col_max = dynamut_data[cols].max()
        col_min = dynamut_data[cols].min()
        print( '\n===================='
              , '\nColname:', cols
              , '\n===================='
              , '\nMax: ', col_max
              , '\nMin: ', col_min)

        scaled_colname = cols + '_scaled'
        print('\nCreated scaled colname for', cols, ':', scaled_colname)

        values = dynamut_data[cols]

        # A denominator is only relevant when a value of that sign exists, in
        # which case it is non-zero. Falling back to 1 otherwise avoids 0/0
        # for zero entries when the column max is exactly 0.
        neg_denominator = abs(col_min) if col_min < 0 else 1
        pos_denominator = col_max if col_max > 0 else 1

        # non-negative values use the max, everything else (negative values
        # and missing values, which stay missing) uses abs(min)
        dynamut_data[scaled_colname] = (values / pos_denominator).where(
            values >= 0, values / neg_denominator)

        col_scaled_max = dynamut_data[scaled_colname].max()
        col_scaled_min = dynamut_data[scaled_colname].min()
        print( '\n===================='
              , '\nColname:', scaled_colname
              , '\n===================='
              , '\nMax: ', col_scaled_max
              , '\nMin: ', col_scaled_min)

    #%%=====================================================================
    #############
    # reorder columns
    #############
    dynamut_data_f = dynamut_data[['mutationinformation'
                                   , 'chain'
                                   , 'ddg_dynamut2'
                                   , 'ddg_dynamut2_scaled'
                                   , 'ddg_dynamut2_outcome']]

    # equal sorted names implies equal length, so one comparison is enough
    if sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns):
        print('\nPASS: outcome_classification, scaling and column reordering completed')
    else:
        print('\nFAIL: Something went wrong...'
              , '\nExpected length: ', len(dynamut_data.columns)
              , '\nGot: ', len(dynamut_data_f.columns))
        # non-zero status so calling scripts and shells can detect the failure
        sys.exit(1)

    return(dynamut_data_f)
#%%#####################################################################

View file

@ -15,9 +15,9 @@ import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
from format_results_dynamut import *
from format_results_dynamut2 import *
########################################################################
# variables
# TODO: add cmd line args
gene = 'gid'
@ -26,28 +26,47 @@ datadir = homedir + '/git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'
outdir_dynamut = outdir + '/dynamut_results/'
outdir_dynamut2 = outdir + '/dynamut_results/dynamut2/'
# Input file
infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv'
infile_dynamut2 = outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv'
# Formatted output filename
outfile_dynamut_f = outdir_dynamut + gene + '_complex_dynamut_norm.csv'
outfile_dynamut_f = outdir_dynamut2 + gene + '_complex_dynamut_norm.csv'
outfile_dynamut2_f = outdir_dynamut2 + gene + '_complex_dynamut2_norm.csv'
#==========================
# CALL: format_results_mcsm_na()
# Data: gid+streptomycin
#==========================
print('Formatting results for:', infile_dynamut)
dynamut_df_f = format_dynamut_output(dynamut_output_csv = infile_dynamut)
#===============================
# CALL: format_results_dynamut
# DYNAMUT results
# #===============================
# print('Formatting results for:', infile_dynamut)
# dynamut_df_f = format_dynamut_output(infile_dynamut)
# # writing file
# print('Writing formatted dynamut df to csv')
# dynamut_df_f.to_csv(outfile_dynamut_f, index = False)
# print('Finished writing file:'
# , '\nFile:', outfile_dynamut_f
# , '\nExpected no. of rows:', len(dynamut_df_f)
# , '\nExpected no. of cols:', len(dynamut_df_f.columns)
# , '\n=============================================================')
#===============================
# CALL: format_results_dynamut2
# DYNAMUT2 results
#===============================
print('Formatting results for:', infile_dynamut2)
dynamut2_df_f = format_dynamut2_output(infile_dynamut2) # dynamut2
# writing file
print('Writing formatted dynamut df to csv')
dynamut_df_f.to_csv(outfile_dynamut_f, index = False)
print('Writing formatted dynamut2 df to csv')
dynamut2_df_f.to_csv(outfile_dynamut2_f, index = False)
print('Finished writing file:'
, '\nFile:', outfile_dynamut_f
, '\nExpected no. of rows:', len(dynamut_df_f)
, '\nExpected no. of cols:', len(dynamut_df_f.columns)
, '\nFile:', outfile_dynamut2_f
, '\nExpected no. of rows:', len(dynamut2_df_f)
, '\nExpected no. of cols:', len(dynamut2_df_f.columns)
, '\n=============================================================')
#%%#####################################################################

View file

@ -24,20 +24,9 @@ indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
outdir_dynamut_temp = outdir + 'dynamut_results/dynamut_temp/'
#==============================================================================
# batch 8: 08.txt, # RETRIEVED 23 Feb 08:54
#my_url_file = outdir + '/dynamut_temp/dynamut_result_url_gid_b8.txt'
#my_suffix = 'gid_b7'
#b09 and b10 failed, ran by Carlos, and returned results on 12 Aug
# batch 9 and 10: RETRIEVED 12 Aug 09:25
#my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b10.txt'
#my_suffix = 'gid_b10'
# batch10_21: from bisection: humour me! (not needed since b10 was run
# by the dynamut team, but it's still ready to extract!) RETRIEVED 12 Aug 17:37
my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b10_21.txt'
my_suffix = 'gid_b10_21'
# batch 7 (previously 1b file): RETRIEVED 17 Aug 16:40
my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b7.txt'
my_suffix = 'gid_b7'
#==============================================================================
#==========================
@ -52,4 +41,4 @@ get_results(url_file = my_url_file
, output_dir = outdir
, outfile_suffix = my_suffix)
########################################################################
########################################################################