From f7aac580814358c67976a36e1fac347a3f701604 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 19 Aug 2021 16:25:38 +0100 Subject: [PATCH] adde format_results_dynamut2.py and ran shiny scripts for barplots --- dynamut/format_results_dynamut.py | 9 +- dynamut/format_results_dynamut2.py | 137 ++++++++++++++++++ dynamut/run_format_results_dynamut.py | 45 ++++-- dynamut/run_get_results_dynamut.py | 19 +-- scripts/functions/plotting_data.R | 2 +- scripts/functions/position_count_bp.R | 25 +++- scripts/functions/stability_count_bp.R | 25 +++- scripts/functions/test_bp.R | 30 ++-- .../plotting/redundant/basic_barplots_PS.R | 2 +- 9 files changed, 235 insertions(+), 59 deletions(-) create mode 100644 dynamut/format_results_dynamut2.py diff --git a/dynamut/format_results_dynamut.py b/dynamut/format_results_dynamut.py index 261bb54..05dbd0c 100644 --- a/dynamut/format_results_dynamut.py +++ b/dynamut/format_results_dynamut.py @@ -123,7 +123,7 @@ def format_dynamut_output(dynamut_output_csv): # reorder columns ############# dynamut_data.columns - dynamut_dataf = dynamut_data[['mutationinformation' + dynamut_data_f = dynamut_data[['mutationinformation' , 'ddg_dynamut' , 'ddg_dynamut_scaled' @@ -149,13 +149,14 @@ def format_dynamut_output(dynamut_output_csv): , 'dds_encom_scaled' , 'dds_encom_outcome']] - if len(dynamut_data.columns) == len(dynamut_dataf): + if len(dynamut_data.columns) == len(dynamut_data_f.columns) and sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns): print('\nPASS: outcome_classification, scaling and column reordering completed') else: print('\nFAIL: Something went wrong...' , '\nExpected length: ', len(dynamut_data.columns) - , '\nGot: ', len(dynamut_dataf)) + , '\nGot: ', len(dynamut_data_f.columns)) + sys.exit() - return(dynamut_dataf) + return(dynamut_data_f) #%%##################################################################### diff --git a/dynamut/format_results_dynamut2.py b/dynamut/format_results_dynamut2.py new file mode 100644 index 0000000..fe806e8 --- /dev/null +++ b/dynamut/format_results_dynamut2.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Aug 19 14:33:51 2020 + +@author: tanu +""" +#%% load packages +import os,sys +import subprocess +import argparse +import requests +import re +import time +from bs4 import BeautifulSoup +import pandas as pd +import numpy as np +from pandas.api.types import is_string_dtype +from pandas.api.types import is_numeric_dtype +#%%##################################################################### +def format_dynamut2_output(dynamut_output_csv): + """ + @param dynamut_output_csv: file containing dynamut2 results for all muts + which is the result of combining all dynamut2_output batch results, and using + bash scripts to combine all the batch results into one file. + Dynamut2ran manually from batches + Formatting df to a pandas df and output as csv. + @type string + + @return (not true) formatted csv for dynamut output + @type pandas df + + """ + ############# + # Read file + ############# + dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',') + + # strip white space from both ends in all columns + dynamut_data = dynamut_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x) + + dforig_shape = dynamut_data.shape + print('dimensions of input file:', dforig_shape) + +#%%============================================================================ + ##################################### + # create binary cols for ddg_dynamut2 + # >=0: Stabilising + ###################################### + outcome_cols = ['ddg_dynamut2'] + + # col test: ddg_dynamut + #len(dynamut_data[dynamut_data['ddg_dynamut'] >= 0]) + #dynamut_data['ddg_dynamut_outcome'] = dynamut_data['ddg_dynamut'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising') + #len(dynamut_data[dynamut_data['ddg_dynamut_outcome'] == 'Stabilising']) + + print('\nCreating classification cols for', len(outcome_cols), 'columns' + , '\nThese are:') + + for cols in outcome_cols: + print(cols) + + tot_muts = dynamut_data[cols].count() + print('\nTotal entries:', tot_muts) + + outcome_colname = cols + '_outcome' + print(cols, ':', outcome_colname) + c1 = len(dynamut_data[dynamut_data[cols] >= 0]) + dynamut_data[outcome_colname] = dynamut_data[cols].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising') + c2 = len(dynamut_data[dynamut_data[outcome_colname] == 'Stabilising']) + if c1 == c2: + print('\nPASS: outcome classification column created successfully' + , '\nColumn created:', outcome_colname + #, '\nNo. of stabilising muts: ', c1 + #, '\nNo. of DEstabilising muts: ', tot_muts-c1 + , '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts() ) + + else: + print('\nFAIL: outcome classification numbers MISmatch' + , '\nexpected length:', c1 + , '\nGot:', c2) + +#%%===================================================================== + ################################ + # scale all ddg_dynamut2 values + ################################# + # Rescale values in all ddg_dynamut2 col col b/w -1 and 1 so negative numbers + # stay neg and pos numbers stay positive + + outcome_cols = ['ddg_dynamut2'] + + for cols in outcome_cols: + #print(cols) + col_max = dynamut_data[cols].max() + col_min = dynamut_data[cols].min() + print( '\n====================' + , '\nColname:', cols + , '\n====================' + , '\nMax: ', col_max + , '\nMin: ', col_min) + + scaled_colname = cols + '_scaled' + print('\nCreated scaled colname for', cols, ':', scaled_colname) + col_scale = lambda x : x/abs(col_min) if x < 0 else (x/col_max if x >= 0 else 'failed') + + dynamut_data[scaled_colname] = dynamut_data[cols].apply(col_scale) + + col_scaled_max = dynamut_data[scaled_colname].max() + col_scaled_min = dynamut_data[scaled_colname].min() + print( '\n====================' + , '\nColname:', scaled_colname + , '\n====================' + , '\nMax: ', col_scaled_max + , '\nMin: ', col_scaled_min) + +#%%===================================================================== + ############# + # reorder columns + ############# + dynamut_data.columns + dynamut_data_f = dynamut_data[['mutationinformation' + , 'chain' + , 'ddg_dynamut2' + , 'ddg_dynamut2_scaled' + , 'ddg_dynamut2_outcome']] + + if len(dynamut_data.columns) == len(dynamut_data_f.columns) and sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns): + print('\nPASS: outcome_classification, scaling and column reordering completed') + else: + print('\nFAIL: Something went wrong...' + , '\nExpected length: ', len(dynamut_data.columns) + , '\nGot: ', len(dynamut_data_f.columns)) + sys.exit() + + return(dynamut_data_f) +#%%##################################################################### + diff --git a/dynamut/run_format_results_dynamut.py b/dynamut/run_format_results_dynamut.py index c8922e0..02af524 100644 --- a/dynamut/run_format_results_dynamut.py +++ b/dynamut/run_format_results_dynamut.py @@ -15,9 +15,9 @@ import os homedir = os.path.expanduser('~') os.chdir (homedir + '/git/LSHTM_analysis/dynamut') from format_results_dynamut import * +from format_results_dynamut2 import * ######################################################################## # variables - # TODO: add cmd line args gene = 'gid' @@ -26,28 +26,47 @@ datadir = homedir + '/git/Data' indir = datadir + '/' + drug + '/input' outdir = datadir + '/' + drug + '/output' outdir_dynamut = outdir + '/dynamut_results/' +outdir_dynamut2 = outdir + '/dynamut_results/dynamut2/' # Input file infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv' +infile_dynamut2 = outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv' # Formatted output filename -outfile_dynamut_f = outdir_dynamut + gene + '_complex_dynamut_norm.csv' +outfile_dynamut_f = outdir_dynamut2 + gene + '_complex_dynamut_norm.csv' +outfile_dynamut2_f = outdir_dynamut2 + gene + '_complex_dynamut2_norm.csv' -#========================== -# CALL: format_results_mcsm_na() -# Data: gid+streptomycin -#========================== -print('Formatting results for:', infile_dynamut) -dynamut_df_f = format_dynamut_output(dynamut_output_csv = infile_dynamut) +#=============================== +# CALL: format_results_dynamut +# DYNAMUT results +# #=============================== +# print('Formatting results for:', infile_dynamut) +# dynamut_df_f = format_dynamut_output(infile_dynamut) +# # writing file +# print('Writing formatted dynamut df to csv') +# dynamut_df_f.to_csv(outfile_dynamut_f, index = False) + +# print('Finished writing file:' +# , '\nFile:', outfile_dynamut_f +# , '\nExpected no. of rows:', len(dynamut_df_f) +# , '\nExpected no. of cols:', len(dynamut_df_f.columns) +# , '\n=============================================================') + +#=============================== +# CALL: format_results_dynamut2 +# DYNAMUT2 results +#=============================== +print('Formatting results for:', infile_dynamut2) +dynamut2_df_f = format_dynamut2_output(infile_dynamut2) # dynamut2 # writing file -print('Writing formatted dynamut df to csv') -dynamut_df_f.to_csv(outfile_dynamut_f, index = False) +print('Writing formatted dynamut2 df to csv') +dynamut2_df_f.to_csv(outfile_dynamut2_f, index = False) print('Finished writing file:' - , '\nFile:', outfile_dynamut_f - , '\nExpected no. of rows:', len(dynamut_df_f) - , '\nExpected no. of cols:', len(dynamut_df_f.columns) + , '\nFile:', outfile_dynamut2_f + , '\nExpected no. of rows:', len(dynamut2_df_f) + , '\nExpected no. of cols:', len(dynamut2_df_f.columns) , '\n=============================================================') #%%##################################################################### \ No newline at end of file diff --git a/dynamut/run_get_results_dynamut.py b/dynamut/run_get_results_dynamut.py index 4ddcc2f..e9e82ef 100755 --- a/dynamut/run_get_results_dynamut.py +++ b/dynamut/run_get_results_dynamut.py @@ -24,20 +24,9 @@ indir = datadir + drug + '/input/' outdir = datadir + drug + '/output/' outdir_dynamut_temp = outdir + 'dynamut_results/dynamut_temp/' #============================================================================== -# batch 8: 08.txt, # RETRIEVED 23 Feb 08:54 -#my_url_file = outdir + '/dynamut_temp/dynamut_result_url_gid_b8.txt' -#my_suffix = 'gid_b7' - -#b09 and b10 failed, ran by Carlos, and returned results on 12 Aug - -# batch 9 and 10: RETRIEVED 12 Aug 09:25 -#my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b10.txt' -#my_suffix = 'gid_b10' - -# batch10_21: from bissection: humour me! (don't need since b10 ran -# from dynamut team, but still its ready to extract it!) RETRIEVED 12 Aug 17:37 -my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b10_21.txt' -my_suffix = 'gid_b10_21' +# batch 7 (previously 1b file): RETRIEVED 17 Aug 16:40 +my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b7.txt' +my_suffix = 'gid_b7' #============================================================================== #========================== @@ -52,4 +41,4 @@ get_results(url_file = my_url_file , output_dir = outdir , outfile_suffix = my_suffix) -######################################################################## +######################################################################## \ No newline at end of file diff --git a/scripts/functions/plotting_data.R b/scripts/functions/plotting_data.R index 105abc8..d3dff8d 100755 --- a/scripts/functions/plotting_data.R +++ b/scripts/functions/plotting_data.R @@ -23,7 +23,7 @@ my_df_u_lig = data.frame() dup_muts = data.frame() #=========================== -# Read file: struct params +# Read file: struct params #=========================== #df = read.csv(infile_params, header = T) diff --git a/scripts/functions/position_count_bp.R b/scripts/functions/position_count_bp.R index 2e1c26c..ce0767c 100755 --- a/scripts/functions/position_count_bp.R +++ b/scripts/functions/position_count_bp.R @@ -30,8 +30,12 @@ site_snp_count_bp <- function (plotdf , axis_text_size = 25 , axis_label_size = 22 , xaxis_title = "Number of nsSNPs" - , yaxis_title = "Number of Sites"){ - + , yaxis_title = "Number of Sites" + , title_colour = "chocolate4" + , subtitle_text = NULL + , subtitle_size = 20 + , subtitle_colour = "pink") + { # dim of plotdf cat(paste0("\noriginal df dimensions:" , "\nNo. of rows:", nrow(plotdf) @@ -83,9 +87,9 @@ site_snp_count_bp <- function (plotdf # FIXME: should really be legend title # but atm being using as plot title - my_leg_title = paste0("Total nsSNPs:", tot_muts - , ", Total no. of nsSNPs sites:", tot_sites) - bp_plot_title = my_leg_title + #my_leg_title + bp_plot_title = paste0("Total nsSNPs: ", tot_muts + , ", Total no. of nsSNPs sites: ", tot_sites) #------------- # start plot 2 @@ -111,11 +115,16 @@ site_snp_count_bp <- function (plotdf #, legend.position = c(0.73,0.8) #, legend.text = element_text(size = leg_text_size) #, legend.title = element_text(size = axis_label_size) - , plot.title = element_text(size = leg_text_size)) + + , plot.title = element_text(size = leg_text_size + , colour = title_colour) + , plot.subtitle = element_text(size = subtitle_size + , hjust = 0.5 + , colour = subtitle_colour)) + labs(title = bp_plot_title - , x = xaxis_title - , y = yaxis_title) + , subtitle = subtitle_text + , x = xaxis_title + , y = yaxis_title) return(OutPlot_pos_count) } diff --git a/scripts/functions/stability_count_bp.R b/scripts/functions/stability_count_bp.R index aa8f5a4..8f7d9ed 100644 --- a/scripts/functions/stability_count_bp.R +++ b/scripts/functions/stability_count_bp.R @@ -22,7 +22,14 @@ stability_count_bp <- function(plotdf , leg_text_size = 20 , leg_title_size = 22 , yaxis_title = "Number of nsSNPs" - , bp_plot_title = ""){ + , bp_plot_title = "" + , label_categories = c("Destabilising", "Stabilising") + , title_colour = "chocolate4" + , subtitle_text = NULL + , subtitle_size = 20 + , subtitle_colour = "pink" + #, leg_position = c(0.73,0.8) # within plot area + , leg_position = "top"){ OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) + geom_bar(aes(fill = eval(parse(text = df_colname))), show.legend = TRUE) + @@ -35,14 +42,20 @@ stability_count_bp <- function(plotdf , axis.title.x = element_blank() , axis.title.y = element_text(size = axis_label_size) , axis.text.y = element_text(size = axis_text_size) - , legend.position = c(0.73,0.8) + , legend.position = leg_position , legend.text = element_text(size = leg_text_size) , legend.title = element_text(size = leg_title_size) - , plot.title = element_text(size = axis_label_size)) + - labs(title = bp_plot_title - , y = yaxis_title) + + , plot.title = element_text(size = axis_label_size + , colour = title_colour) + , plot.subtitle = element_text(size = subtitle_size + , hjust = 0.5 + , colour = subtitle_colour)) + + labs(title = bp_plot_title + , subtitle = subtitle_text + , y = yaxis_title) + scale_fill_discrete(name = leg_title - , labels = c("Destabilising", "Stabilising")) + #, labels = c("Destabilising", "Stabilising") + , labels = label_categories) return(OutPlot_count) } diff --git a/scripts/functions/test_bp.R b/scripts/functions/test_bp.R index dc57452..bdb48ca 100644 --- a/scripts/functions/test_bp.R +++ b/scripts/functions/test_bp.R @@ -6,21 +6,24 @@ getwd() # load functions, data, dirs, hardocded vars # that will be used in testing the functions #=========================================== -source("plotting_data.R") -infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv" +drug = "streptomycin" +gene = "gid" + +source("plotting_data.R") + +infile = paste0("~/git/Data/", drug, "/output/", gene, "_comb_stab_struc_params.csv") +infile_df = read.csv(infile) + +lig_dist = 5 +pd_df = plotting_data(infile_df + , lig_dist_colname = 'ligand_distance' + , lig_dist_cutoff = lig_dist) -pd_df = plotting_data(infile) my_df = pd_df[[1]] my_df_u = pd_df[[2]] my_df_u_lig = pd_df[[3]] dup_muts = pd_df[[4]] -source("plotting_globals.R") -drug = "streptomycin" -gene = "gid" - -import_dirs(drug, gene) - #===================== # functions to test #===================== @@ -40,7 +43,9 @@ print(paste0("plot filename:", basic_bp_duet)) # function only stability_count_bp(plotdf = my_df_u , df_colname = "duet_outcome" - , leg_title = "DUET outcome") + , leg_title = "DUET outcome" + , label_categories = c("Destabilising", "Stabilising") + , leg_position = "top") dev.off() @@ -54,10 +59,13 @@ svg(plot_basic_bp_ligand) print(paste0("plot filename:", basic_bp_ligand)) # function only +lig_dist = 10 stability_count_bp(plotdf = my_df_u_lig , df_colname = "ligand_outcome" , leg_title = "Ligand outcome" - , bp_plot_title = "Sites < 10 Ang of ligand") + , yaxis_title = paste0("Number of nsSNPs\nLigand dist: <", lig_dist, "\u212b") + #, bp_plot_title = "Sites < 10 Ang of ligand" + ) dev.off() # ------------------------------ diff --git a/scripts/plotting/redundant/basic_barplots_PS.R b/scripts/plotting/redundant/basic_barplots_PS.R index 20f8457..d6b4530 100755 --- a/scripts/plotting/redundant/basic_barplots_PS.R +++ b/scripts/plotting/redundant/basic_barplots_PS.R @@ -103,7 +103,7 @@ cat(paste0("Directories imported:" cat(paste0("\nVariables imported:" , "\ndrug:", drug , "\ngene:", gene - , "\n)) + , "\n")) #, "\ngene_match:", gene_match #, "\nLength of upos:", length(upos) #, "\nAngstrom symbol:", angstroms_symbol))