From e0f14ed266a6891de17af672767eba8da23fd75c Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 26 Aug 2020 16:39:10 +0100 Subject: [PATCH] sorted subcols_axis script to generate correct axis cols for both PS and lig plots --- scripts/plotting/Header_TT.R | 0 scripts/plotting/barplot_colour_function.R | 0 scripts/plotting/barplots_subcolours_PS.R | 0 scripts/plotting/barplots_subcolours_aa_PS.R | 47 +++--- scripts/plotting/basic_barplots_PS.R | 2 +- scripts/plotting/combining_two_df_FIXME.R | 0 scripts/plotting/mcsm_mean_stability.R | 0 scripts/plotting/plotting_data.R | 2 +- scripts/plotting/subcols_axis_PS.R | 147 ++++++++++++------- 9 files changed, 117 insertions(+), 81 deletions(-) mode change 100644 => 100755 scripts/plotting/Header_TT.R mode change 100644 => 100755 scripts/plotting/barplot_colour_function.R mode change 100644 => 100755 scripts/plotting/barplots_subcolours_PS.R mode change 100644 => 100755 scripts/plotting/barplots_subcolours_aa_PS.R mode change 100644 => 100755 scripts/plotting/basic_barplots_PS.R mode change 100644 => 100755 scripts/plotting/combining_two_df_FIXME.R mode change 100644 => 100755 scripts/plotting/mcsm_mean_stability.R mode change 100644 => 100755 scripts/plotting/plotting_data.R mode change 100644 => 100755 scripts/plotting/subcols_axis_PS.R diff --git a/scripts/plotting/Header_TT.R b/scripts/plotting/Header_TT.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/barplot_colour_function.R b/scripts/plotting/barplot_colour_function.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/barplots_subcolours_PS.R b/scripts/plotting/barplots_subcolours_PS.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/barplots_subcolours_aa_PS.R b/scripts/plotting/barplots_subcolours_aa_PS.R old mode 100644 new mode 100755 index 84b5811..5850abf --- a/scripts/plotting/barplots_subcolours_aa_PS.R +++ b/scripts/plotting/barplots_subcolours_aa_PS.R @@ -1,3 +1,4 @@ +#!/usr/bin/env Rscript getwd() setwd("~/git/LSHTM_analysis/scripts/plotting") getwd() @@ -42,14 +43,30 @@ cat(paste0("Variables imported:" , "\nAngstrom symbol:", angstroms_symbol)) # clear excess variable -rm(my_df, upos, dup_muts, my_df_u_lig) +rm(dup_muts_cols, mut_pos_cols_lig, my_df_cols, my_df_u_cols_lig, upos) + #======================================================================= +# !!! very important!!!! #================ # Inspecting mut_pos_cols -# position numbers and colours +# position numbers and colours and assigning axis colours based on lab_fg +# of the correct df # open file from desktop ("sample_axis_cols") for cross checking #================ +# very important! +#my_axis_colours = mut_pos_cols$lab_fg +if ( nrow(mut_pos_cols) == length(unique(my_df_u_cols$position)) ){ + print("PASS: lengths checked, assigning axis colours") + my_axis_colours = mut_pos_cols$lab_fg + cat("length of axis colours:", length(my_axis_colours) + , "\nwhich corresponds to the number of positions on the x-axis of the plot") +}else{ + print("FAIL:lengths mismatch, could not assign axis colours") + quit() +} + +# further sanity checks table(mut_pos_cols$lab_bg) check_lab_bg = sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True check_lab_bg @@ -70,12 +87,6 @@ if (check_lab_bg && check_lab_bg2 && check_lab_fg) { quit() } -# very important! -my_axis_colours = mut_pos_cols$lab_fg - -# now clear mut_pos_cols -rm(mut_pos_cols) - #======= # output #======= @@ -89,13 +100,13 @@ plot_bp_aa_subcols_duet = paste0(plotdir, "/", bp_aa_subcols_duet) # Data for plots #================ # REASSIGNMENT as necessary -df = my_df_u +df = my_df_u_cols # sanity checks str(df) ########################### -# 2: Plot: DUET scores +# Plot: DUET scores ########################### #========================== @@ -137,7 +148,7 @@ snp_count = sort(unique(snpsBYpos_df$snpsBYpos)) if (is.factor(df$duet_outcome)){ print("duet_outcome is factor") }else{ - print("convert duet_outcome to factor") + print("converting duet_outcome to factor") df$duet_outcome = as.factor(df$duet_outcome) } @@ -165,25 +176,17 @@ tapply(df$duet_scaled, df$duet_outcome, max) u = unique(df$duet_scaled) cat("No. of unique values in normalised data:", length(u)) -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# Run this section if rounding is to be used -# specify number for rounding -#n = 3 -#df$duet_scaledR = round(df$duet_scaled, n) -#ur = unique(df$duet_scaledR) - -# create an extra column called group which contains the "gp name and score" +# Define group +# Create an extra column called group which contains the "gp name and score" # so colours can be generated for each unique values in this column - -#my_grp = df$duet_scaledR # rounding my_grp = df$duet_scaled # no rounding -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% df$group <- paste0(df$duet_outcome, "_", my_grp, sep = "") # Call the function to create the palette based on the group defined above colours <- ColourPalleteMulti(df, "duet_outcome", "my_grp") print(paste0("Colour palette generated for: ", length(colours), " colours")) my_title = "Protein stability (DUET)" +cat("No. of axis colours: ", length(my_axis_colours)) #======================== # plot with axis colours diff --git a/scripts/plotting/basic_barplots_PS.R b/scripts/plotting/basic_barplots_PS.R old mode 100644 new mode 100755 index c28332b..47d34d9 --- a/scripts/plotting/basic_barplots_PS.R +++ b/scripts/plotting/basic_barplots_PS.R @@ -32,7 +32,7 @@ cat(paste0("Directories imported:" , "\ndrug:", drug , "\ngene:", gene , "\ngene_match:", gene_match - , "\nLength of upos:", length(upos)) + , "\nLength of upos:", length(upos) , "\nAngstrom symbol:", angstroms_symbol)) # clear excess variable diff --git a/scripts/plotting/combining_two_df_FIXME.R b/scripts/plotting/combining_two_df_FIXME.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/mcsm_mean_stability.R b/scripts/plotting/mcsm_mean_stability.R old mode 100644 new mode 100755 diff --git a/scripts/plotting/plotting_data.R b/scripts/plotting/plotting_data.R old mode 100644 new mode 100755 index b9f2197..291579e --- a/scripts/plotting/plotting_data.R +++ b/scripts/plotting/plotting_data.R @@ -67,7 +67,7 @@ cat("\nInput dimensions:", dim(my_df)) #str(my_df) ########################### -# extract unique mutations +# extract unique mutation entries ########################### # check for duplicate mutations diff --git a/scripts/plotting/subcols_axis_PS.R b/scripts/plotting/subcols_axis_PS.R old mode 100644 new mode 100755 index 5fe0ab3..867d8bd --- a/scripts/plotting/subcols_axis_PS.R +++ b/scripts/plotting/subcols_axis_PS.R @@ -1,7 +1,6 @@ ######################################################### -# TASK: Adding colours to positions labels according to -# active site residues. This is so these can be seen promptly -# when visualising the barplot. +# TASK: Adding colours to dfs so they can be used for plotting +# add cols to each of the my_df* dfs ######################################################### #======================================================================= getwd() @@ -15,46 +14,45 @@ source("plotting_data.R") # my_df_u_lig # dup_muts +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nLength of upos:", length(upos) + , "\nAngstrom symbol:", angstroms_symbol)) + +# clear excess variable +rm(upos, dup_muts, my_df_u, my_df_u_lig) + +# This is because we want to assign the colours to my_df +# and then resubset accordingly for our plots to avoid multiple merges + #======================================================================= -########################### -# Read file: struct params -########################### -cat("Reading struct params including mcsm:", in_filename_params) +# df to use: my_df +# NOTE: my_df contains duplicate muts but its ok as you are only adding +# colours to positions -my_df = read.csv(infile_params - #, stringsAsFactors = F - , header = T) +# sanity checks: ensure my_df is ordered by position: it should be +my_df$position; my_df$mutationinformation -cat("Input dimensions:", dim(my_df)) +my_df_o = my_df[order(my_df$position),] +my_df_o$position; my_df_o$mutationinformation -# clear variables -rm(in_filename_params, infile_params) +head(my_df_o$position) == head(my_df$position) +head(my_df_o$mutationinformation) == head(my_df$mutationinformation) +tail(my_df_o$position) == tail(my_df$position) +tail(my_df_o$mutationinformation) == tail(my_df$mutationinformation) -# quick checks -colnames(my_df) -str(my_df) +my_df = my_df_o -# check for duplicate mutations -if ( length(unique(my_df$mutationinformation)) != length(my_df$mutationinformation)){ - cat(paste0("CAUTION:", " Duplicate mutations identified" - , "\nExtracting these...")) - dup_muts = my_df[duplicated(my_df$mutationinformation),] - dup_muts_nu = length(unique(dup_muts$mutationinformation)) - cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts) - , "\nNo. of unique duplicate mutations:", dup_muts_nu - , "\n\nExtracting df with unique mutations only")) - my_df_u = my_df[!duplicated(my_df$mutationinformation),] -}else{ - cat(paste0("No duplicate mutations detected")) - my_df_u = my_df -} - -upos = unique(my_df_u$position) -cat("Dim of clean df:"); cat(dim(my_df_u)) -cat("\nNo. of unique mutational positions:"); cat(length(upos)) -#======================================================================= # create a new df with unique position numbers and cols -position = unique(my_df$position) #130 +position = unique(my_df$position) position_cols = as.data.frame(position) head(position_cols) ; tail(position_cols) @@ -143,6 +141,7 @@ mut_pos_cols = merge(position_cols, aa_cols_ref , all.x = TRUE) head(mut_pos_cols) + # replace NA"s # :column "lab_bg" with "white" # : column "lab_fg" with "black" @@ -165,39 +164,69 @@ head(df0$position); tail(df0$position) head(df1$position); tail(df1$position) # should now have 3 extra columns -my_df = merge(df0, df1 +my_df_cols = merge(df0, df1 , by = "position" , all.x = TRUE) # sanity check -my_df[my_df$position == "49",] -my_df[my_df$position == "13",] +my_df_cols[my_df_cols$position == "49",] +my_df_cols[my_df_cols$position == "13",] -rm(df0, df1) -#=========== -# Merge 3: Merge mut_pos_cols with mcsm df_u -# Now combined the positions with aa colours with -# the mcsm_data -#=========== -# dfs to merge -df0 = my_df_u # my_df_u -df1 = mut_pos_cols +########################### +# extract unique mutation entries +########################### -# check the column on which merge will be performed -head(df0$position); tail(df0$position) -head(df1$position); tail(df1$position) +# check for duplicate mutations +if ( length(unique(my_df_cols$mutationinformation)) != length(my_df_cols$mutationinformation)){ + cat(paste0("\nCAUTION:", " Duplicate mutations identified" + , "\nExtracting these...")) + dup_muts_cols = my_df_cols[duplicated(my_df_cols$mutationinformation),] + dup_muts_cols_nu = length(unique(dup_muts_cols$mutationinformation)) + cat(paste0("\nDim of duplicate mutation df:", nrow(dup_muts_cols) + , "\nNo. of unique duplicate mutations:", dup_muts_cols_nu + , "\n\nExtracting df with unique mutations only")) + my_df_u_cols = my_df_cols[!duplicated(my_df_cols$mutationinformation),] +}else{ + cat(paste0("\nNo duplicate mutations detected")) + my_df_u_cols = my_df_cols +} + +upos = unique(my_df_u_cols$position) +cat("\nDim of clean df:"); cat(dim(my_df_u_cols)) +cat("\nNo. of unique mutational positions:"); cat(length(upos), "\n") -# should now have 3 extra columns -my_df_u = merge(df0, df1 - , by = "position" - , all.x = TRUE) # sanity check -my_df[my_df$position == "49",] -my_df[my_df$position == "13",] +my_df_u_cols[my_df_u_cols$position == "49",] +my_df_u_cols[my_df_u_cols$position == "13",] +my_df_u_cols[my_df_u_cols$position == "103",] +########################### +# extract mutations <10Angstroms +########################### +table(my_df_u_cols$ligand_distance<10) + +my_df_u_cols_lig = my_df_u_cols[my_df_u_cols$ligand_distance <10,] +angstroms_symbol = "\u212b" +cat(paste0("There are ", nrow(my_df_u_cols_lig), " sites lying within 10", angstroms_symbol, " of the ligand")) + +#================= +# very important! +#================= +#my_axis_colours = mut_pos_cols$lab_fg # doesn't work if positions numbers are subsetted as in ligand +# need the equivalent of the mut_pos_cols for ligand + +# get position numbers for ligand +lig_pos = my_df_u_cols_lig$position + +# subset mut_pos_cols for ligand positions +mut_pos_cols_lig = mut_pos_cols[mut_pos_cols$position %in% lig_pos,] +#my_axis_colours = mut_pos_cols_lig$lab_fg + +#==================================================================== # clear variables rm(aa_cols_ref + , my_df , df0 , df1 , position_cols @@ -206,3 +235,7 @@ rm(aa_cols_ref , lab_fg , position) +####################################################################### +# end of script +####################################################################### +