added format_results_dynamut2.py and ran shiny scripts for barplots
This commit is contained in:
parent
8cdf720702
commit
f7aac58081
9 changed files with 235 additions and 59 deletions
|
@ -123,7 +123,7 @@ def format_dynamut_output(dynamut_output_csv):
|
|||
# reorder columns
|
||||
#############
|
||||
dynamut_data.columns
|
||||
dynamut_dataf = dynamut_data[['mutationinformation'
|
||||
dynamut_data_f = dynamut_data[['mutationinformation'
|
||||
|
||||
, 'ddg_dynamut'
|
||||
, 'ddg_dynamut_scaled'
|
||||
|
@ -149,13 +149,14 @@ def format_dynamut_output(dynamut_output_csv):
|
|||
, 'dds_encom_scaled'
|
||||
, 'dds_encom_outcome']]
|
||||
|
||||
if len(dynamut_data.columns) == len(dynamut_dataf):
|
||||
if len(dynamut_data.columns) == len(dynamut_data_f.columns) and sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns):
|
||||
print('\nPASS: outcome_classification, scaling and column reordering completed')
|
||||
else:
|
||||
print('\nFAIL: Something went wrong...'
|
||||
, '\nExpected length: ', len(dynamut_data.columns)
|
||||
, '\nGot: ', len(dynamut_dataf))
|
||||
, '\nGot: ', len(dynamut_data_f.columns))
|
||||
sys.exit()
|
||||
|
||||
return(dynamut_dataf)
|
||||
return(dynamut_data_f)
|
||||
#%%#####################################################################
|
||||
|
||||
|
|
137
dynamut/format_results_dynamut2.py
Normal file
137
dynamut/format_results_dynamut2.py
Normal file
|
@ -0,0 +1,137 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Aug 19 14:33:51 2020
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%% load packages
|
||||
import os,sys
|
||||
import subprocess
|
||||
import argparse
|
||||
import requests
|
||||
import re
|
||||
import time
|
||||
from bs4 import BeautifulSoup
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pandas.api.types import is_string_dtype
|
||||
from pandas.api.types import is_numeric_dtype
|
||||
#%%#####################################################################
|
||||
def format_dynamut2_output(dynamut_output_csv):
    """
    Format the combined DynaMut2 results into a pandas df.

    Reads the csv, strips white space from both ends of every text column,
    and for the ddg_dynamut2 column adds:
      - ddg_dynamut2_outcome: 'Stabilising' when the value is >= 0,
        'Destabilising' when it is < 0; missing values stay missing
      - ddg_dynamut2_scaled: negative values divided by abs(column min) and
        positive values divided by the column max, so the sign is kept;
        zeros stay 0 and missing values stay missing

    @param dynamut_output_csv: file containing dynamut2 results for all muts,
    i.e. all the dynamut2 batch results combined into one file. The column
    selection below needs the columns mutationinformation, chain and
    ddg_dynamut2 (a missing one makes that selection raise KeyError), and
    any additional column makes the final check fail.
    @type string

    @return formatted dynamut2 results with columns in the order:
    mutationinformation, chain, ddg_dynamut2, ddg_dynamut2_scaled,
    ddg_dynamut2_outcome
    @type pandas df

    Exits with status 1 (SystemExit) when the columns after processing are
    not exactly the expected output columns.
    """
    # columns that get an '_outcome' and a '_scaled' companion column
    ddg_cols = ['ddg_dynamut2']

    # final column order of the returned df
    output_cols = ['mutationinformation'
                   , 'chain'
                   , 'ddg_dynamut2'
                   , 'ddg_dynamut2_scaled'
                   , 'ddg_dynamut2_outcome']

    #############
    # Read file
    #############
    dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',')

    # strip white space from both ends in all text columns; text may be held
    # either as object dtype or as pandas' dedicated string dtype
    dynamut_data = dynamut_data_raw.apply(
        lambda x: x.str.strip()
        if (x.dtype == object or isinstance(x.dtype, pd.StringDtype))
        else x)

    print('dimensions of input file:', dynamut_data.shape)

    #####################################
    # create binary cols for ddg_dynamut2
    # >=0: Stabilising
    ######################################
    print('\nCreating classification cols for', len(ddg_cols), 'columns'
          , '\nThese are:')

    for col in ddg_cols:
        print(col)

        values = dynamut_data[col]
        print('\nTotal entries:', values.count())

        outcome_colname = col + '_outcome'
        print(col, ':', outcome_colname)

        dynamut_data[outcome_colname] = _classify_ddg(values)

        # sanity check: no. of values >= 0 must equal no. of 'Stabilising'
        n_expected = int((values >= 0).sum())
        n_got = int((dynamut_data[outcome_colname] == 'Stabilising').sum())
        if n_expected == n_got:
            print('\nPASS: outcome classification column created successfully'
                  , '\nColumn created:', outcome_colname
                  , '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts() )
        else:
            print('\nFAIL: outcome classification numbers MISmatch'
                  , '\nexpected length:', n_expected
                  , '\nGot:', n_got)

    ################################
    # scale all ddg_dynamut2 values
    #################################
    # Rescale values b/w -1 and 1 so negative numbers stay neg and
    # pos numbers stay positive
    for col in ddg_cols:
        values = dynamut_data[col]
        print( '\n===================='
              , '\nColname:', col
              , '\n===================='
              , '\nMax: ', values.max()
              , '\nMin: ', values.min())

        scaled_colname = col + '_scaled'
        print('\nCreated scaled colname for', col, ':', scaled_colname)

        dynamut_data[scaled_colname] = _scale_signed(values)

        print( '\n===================='
              , '\nColname:', scaled_colname
              , '\n===================='
              , '\nMax: ', dynamut_data[scaled_colname].max()
              , '\nMin: ', dynamut_data[scaled_colname].min())

    #############
    # reorder columns
    #############
    dynamut_data_f = dynamut_data[output_cols]

    # equal sorted names also implies an equal number of columns, so this
    # fails exactly when the input carried columns that are not in output_cols
    if sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns):
        print('\nPASS: outcome_classification, scaling and column reordering completed')
    else:
        print('\nFAIL: Something went wrong...'
              , '\nExpected length: ', len(dynamut_data.columns)
              , '\nGot: ', len(dynamut_data_f.columns))
        # non-zero status so calling scripts/shells can detect the failure
        sys.exit(1)

    return(dynamut_data_f)


def _classify_ddg(values):
    """Label ddG values: 'Stabilising' if >= 0, 'Destabilising' if < 0, missing stays missing."""
    labels = (values >= 0).map({True: 'Stabilising', False: 'Destabilising'})
    # NaN compares False against 0, so blank out the label of missing values
    return labels.where(values.notna())


def _scale_signed(values):
    """Divide negatives by abs(min) and positives by max; zeros stay 0, missing stays missing."""
    col_min = values.min()
    col_max = values.max()
    scaled = values.astype(float)
    is_neg = values < 0
    is_pos = values > 0
    # only strictly negative/positive entries are divided, so an all-zero
    # column never evaluates 0/0
    scaled.loc[is_neg] = values.loc[is_neg] / abs(col_min)
    scaled.loc[is_pos] = values.loc[is_pos] / col_max
    return scaled
|
||||
#%%#####################################################################
|
||||
|
|
@ -15,9 +15,9 @@ import os
|
|||
homedir = os.path.expanduser('~')
|
||||
os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
|
||||
from format_results_dynamut import *
|
||||
from format_results_dynamut2 import *
|
||||
########################################################################
|
||||
# variables
|
||||
|
||||
# TODO: add cmd line args
|
||||
|
||||
gene = 'gid'
|
||||
|
@ -26,28 +26,47 @@ datadir = homedir + '/git/Data'
|
|||
indir = datadir + '/' + drug + '/input'
|
||||
outdir = datadir + '/' + drug + '/output'
|
||||
outdir_dynamut = outdir + '/dynamut_results/'
|
||||
outdir_dynamut2 = outdir + '/dynamut_results/dynamut2/'
|
||||
|
||||
# Input file
|
||||
infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv'
|
||||
infile_dynamut2 = outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv'
|
||||
|
||||
# Formatted output filename
|
||||
outfile_dynamut_f = outdir_dynamut + gene + '_complex_dynamut_norm.csv'
|
||||
outfile_dynamut_f = outdir_dynamut2 + gene + '_complex_dynamut_norm.csv'
|
||||
outfile_dynamut2_f = outdir_dynamut2 + gene + '_complex_dynamut2_norm.csv'
|
||||
|
||||
#==========================
|
||||
# CALL: format_results_mcsm_na()
|
||||
# Data: gid+streptomycin
|
||||
#==========================
|
||||
print('Formatting results for:', infile_dynamut)
|
||||
dynamut_df_f = format_dynamut_output(dynamut_output_csv = infile_dynamut)
|
||||
#===============================
|
||||
# CALL: format_results_dynamut
|
||||
# DYNAMUT results
|
||||
# #===============================
|
||||
# print('Formatting results for:', infile_dynamut)
|
||||
# dynamut_df_f = format_dynamut_output(infile_dynamut)
|
||||
# # writing file
|
||||
# print('Writing formatted dynamut df to csv')
|
||||
# dynamut_df_f.to_csv(outfile_dynamut_f, index = False)
|
||||
|
||||
# print('Finished writing file:'
|
||||
# , '\nFile:', outfile_dynamut_f
|
||||
# , '\nExpected no. of rows:', len(dynamut_df_f)
|
||||
# , '\nExpected no. of cols:', len(dynamut_df_f.columns)
|
||||
# , '\n=============================================================')
|
||||
|
||||
#===============================
|
||||
# CALL: format_results_dynamut2
|
||||
# DYNAMUT2 results
|
||||
#===============================
|
||||
print('Formatting results for:', infile_dynamut2)
|
||||
dynamut2_df_f = format_dynamut2_output(infile_dynamut2) # dynamut2
|
||||
|
||||
# writing file
|
||||
print('Writing formatted dynamut df to csv')
|
||||
dynamut_df_f.to_csv(outfile_dynamut_f, index = False)
|
||||
print('Writing formatted dynamut2 df to csv')
|
||||
dynamut2_df_f.to_csv(outfile_dynamut2_f, index = False)
|
||||
|
||||
print('Finished writing file:'
|
||||
, '\nFile:', outfile_dynamut_f
|
||||
, '\nExpected no. of rows:', len(dynamut_df_f)
|
||||
, '\nExpected no. of cols:', len(dynamut_df_f.columns)
|
||||
, '\nFile:', outfile_dynamut2_f
|
||||
, '\nExpected no. of rows:', len(dynamut2_df_f)
|
||||
, '\nExpected no. of cols:', len(dynamut2_df_f.columns)
|
||||
, '\n=============================================================')
|
||||
|
||||
#%%#####################################################################
|
|
@ -24,20 +24,9 @@ indir = datadir + drug + '/input/'
|
|||
outdir = datadir + drug + '/output/'
|
||||
outdir_dynamut_temp = outdir + 'dynamut_results/dynamut_temp/'
|
||||
#==============================================================================
|
||||
# batch 8: 08.txt, # RETRIEVED 23 Feb 08:54
|
||||
#my_url_file = outdir + '/dynamut_temp/dynamut_result_url_gid_b8.txt'
|
||||
#my_suffix = 'gid_b7'
|
||||
|
||||
#b09 and b10 failed, ran by Carlos, and returned results on 12 Aug
|
||||
|
||||
# batch 9 and 10: RETRIEVED 12 Aug 09:25
|
||||
#my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b10.txt'
|
||||
#my_suffix = 'gid_b10'
|
||||
|
||||
# batch10_21: from bissection: humour me! (don't need since b10 ran
|
||||
# from dynamut team, but still its ready to extract it!) RETRIEVED 12 Aug 17:37
|
||||
my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b10_21.txt'
|
||||
my_suffix = 'gid_b10_21'
|
||||
# batch 7 (previously 1b file): RETRIEVED 17 Aug 16:40
|
||||
my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b7.txt'
|
||||
my_suffix = 'gid_b7'
|
||||
#==============================================================================
|
||||
|
||||
#==========================
|
||||
|
|
|
@ -30,8 +30,12 @@ site_snp_count_bp <- function (plotdf
|
|||
, axis_text_size = 25
|
||||
, axis_label_size = 22
|
||||
, xaxis_title = "Number of nsSNPs"
|
||||
, yaxis_title = "Number of Sites"){
|
||||
|
||||
, yaxis_title = "Number of Sites"
|
||||
, title_colour = "chocolate4"
|
||||
, subtitle_text = NULL
|
||||
, subtitle_size = 20
|
||||
, subtitle_colour = "pink")
|
||||
{
|
||||
# dim of plotdf
|
||||
cat(paste0("\noriginal df dimensions:"
|
||||
, "\nNo. of rows:", nrow(plotdf)
|
||||
|
@ -83,9 +87,9 @@ site_snp_count_bp <- function (plotdf
|
|||
|
||||
# FIXME: should really be legend title
|
||||
# but atm being using as plot title
|
||||
my_leg_title = paste0("Total nsSNPs:", tot_muts
|
||||
#my_leg_title
|
||||
bp_plot_title = paste0("Total nsSNPs: ", tot_muts
|
||||
, ", Total no. of nsSNPs sites: ", tot_sites)
|
||||
bp_plot_title = my_leg_title
|
||||
|
||||
#-------------
|
||||
# start plot 2
|
||||
|
@ -111,9 +115,14 @@ site_snp_count_bp <- function (plotdf
|
|||
#, legend.position = c(0.73,0.8)
|
||||
#, legend.text = element_text(size = leg_text_size)
|
||||
#, legend.title = element_text(size = axis_label_size)
|
||||
, plot.title = element_text(size = leg_text_size)) +
|
||||
, plot.title = element_text(size = leg_text_size
|
||||
, colour = title_colour)
|
||||
, plot.subtitle = element_text(size = subtitle_size
|
||||
, hjust = 0.5
|
||||
, colour = subtitle_colour)) +
|
||||
|
||||
labs(title = bp_plot_title
|
||||
, subtitle = subtitle_text
|
||||
, x = xaxis_title
|
||||
, y = yaxis_title)
|
||||
|
||||
|
|
|
@ -22,7 +22,14 @@ stability_count_bp <- function(plotdf
|
|||
, leg_text_size = 20
|
||||
, leg_title_size = 22
|
||||
, yaxis_title = "Number of nsSNPs"
|
||||
, bp_plot_title = ""){
|
||||
, bp_plot_title = ""
|
||||
, label_categories = c("Destabilising", "Stabilising")
|
||||
, title_colour = "chocolate4"
|
||||
, subtitle_text = NULL
|
||||
, subtitle_size = 20
|
||||
, subtitle_colour = "pink"
|
||||
#, leg_position = c(0.73,0.8) # within plot area
|
||||
, leg_position = "top"){
|
||||
|
||||
OutPlot_count = ggplot(plotdf, aes(x = eval(parse(text = df_colname)))) +
|
||||
geom_bar(aes(fill = eval(parse(text = df_colname))), show.legend = TRUE) +
|
||||
|
@ -35,14 +42,20 @@ stability_count_bp <- function(plotdf
|
|||
, axis.title.x = element_blank()
|
||||
, axis.title.y = element_text(size = axis_label_size)
|
||||
, axis.text.y = element_text(size = axis_text_size)
|
||||
, legend.position = c(0.73,0.8)
|
||||
, legend.position = leg_position
|
||||
, legend.text = element_text(size = leg_text_size)
|
||||
, legend.title = element_text(size = leg_title_size)
|
||||
, plot.title = element_text(size = axis_label_size)) +
|
||||
, plot.title = element_text(size = axis_label_size
|
||||
, colour = title_colour)
|
||||
, plot.subtitle = element_text(size = subtitle_size
|
||||
, hjust = 0.5
|
||||
, colour = subtitle_colour)) +
|
||||
labs(title = bp_plot_title
|
||||
, subtitle = subtitle_text
|
||||
, y = yaxis_title) +
|
||||
scale_fill_discrete(name = leg_title
|
||||
, labels = c("Destabilising", "Stabilising"))
|
||||
#, labels = c("Destabilising", "Stabilising")
|
||||
, labels = label_categories)
|
||||
|
||||
return(OutPlot_count)
|
||||
}
|
||||
|
|
|
@ -6,21 +6,24 @@ getwd()
|
|||
# load functions, data, dirs, hardcoded vars
|
||||
# that will be used in testing the functions
|
||||
#===========================================
|
||||
source("plotting_data.R")
|
||||
infile = "/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv"
|
||||
drug = "streptomycin"
|
||||
gene = "gid"
|
||||
|
||||
source("plotting_data.R")
|
||||
|
||||
infile = paste0("~/git/Data/", drug, "/output/", gene, "_comb_stab_struc_params.csv")
|
||||
infile_df = read.csv(infile)
|
||||
|
||||
lig_dist = 5
|
||||
pd_df = plotting_data(infile_df
|
||||
, lig_dist_colname = 'ligand_distance'
|
||||
, lig_dist_cutoff = lig_dist)
|
||||
|
||||
pd_df = plotting_data(infile)
|
||||
my_df = pd_df[[1]]
|
||||
my_df_u = pd_df[[2]]
|
||||
my_df_u_lig = pd_df[[3]]
|
||||
dup_muts = pd_df[[4]]
|
||||
|
||||
source("plotting_globals.R")
|
||||
drug = "streptomycin"
|
||||
gene = "gid"
|
||||
|
||||
import_dirs(drug, gene)
|
||||
|
||||
#=====================
|
||||
# functions to test
|
||||
#=====================
|
||||
|
@ -40,7 +43,9 @@ print(paste0("plot filename:", basic_bp_duet))
|
|||
# function only
|
||||
stability_count_bp(plotdf = my_df_u
|
||||
, df_colname = "duet_outcome"
|
||||
, leg_title = "DUET outcome")
|
||||
, leg_title = "DUET outcome"
|
||||
, label_categories = c("Destabilising", "Stabilising")
|
||||
, leg_position = "top")
|
||||
|
||||
dev.off()
|
||||
|
||||
|
@ -54,10 +59,13 @@ svg(plot_basic_bp_ligand)
|
|||
print(paste0("plot filename:", basic_bp_ligand))
|
||||
|
||||
# function only
|
||||
lig_dist = 10
|
||||
stability_count_bp(plotdf = my_df_u_lig
|
||||
, df_colname = "ligand_outcome"
|
||||
, leg_title = "Ligand outcome"
|
||||
, bp_plot_title = "Sites < 10 Ang of ligand")
|
||||
, yaxis_title = paste0("Number of nsSNPs\nLigand dist: <", lig_dist, "\u212b")
|
||||
#, bp_plot_title = "Sites < 10 Ang of ligand"
|
||||
)
|
||||
|
||||
dev.off()
|
||||
# ------------------------------
|
||||
|
|
|
@ -103,7 +103,7 @@ cat(paste0("Directories imported:"
|
|||
cat(paste0("\nVariables imported:"
|
||||
, "\ndrug:", drug
|
||||
, "\ngene:", gene
|
||||
, "\n))
|
||||
, "\n"))
|
||||
#, "\ngene_match:", gene_match
|
||||
#, "\nLength of upos:", length(upos)
|
||||
#, "\nAngstrom symbol:", angstroms_symbol))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue