max(my_df2$Dis_lig_Ang) #9.847 min(my_df2$Dis_lig_Ang) #3.047 # count no of unique positions length(unique(my_df2$Position))#47 #count no of unique mutations length(unique(my_df2$Mutationinformation)) #110 #clear variable to avoid confusion rm(my_df) #%%%%%%%%%% # Reassign #%%%%%%%%%% my_df = my_df2 # Stacked Barplot with colours: Lig_outcome @ position coloured by # stability scores. This is a barplot where each bar corresponds # to a SNP and is coloured by its corresponding PredAff stability value. # Normalised values (range between -1 and 1 ) to aid visualisation # NOTE: since barplot plots discrete values, colour = score, so number of # colours will be equal to the no. of unique normalised scores # rather than a continuous scale # will require generating the colour scale separately. #============================ # sanity checks upos = unique(my_df$Position) str(my_df$Lig_outcome) colnames(my_df) #=========================== # Data preparation for plots #=========================== #!!!!!!!!!!!!!!!!! # REASSIGNMENT df <- my_df rm(my_df) # sanity checks # should be a factor is.factor(df$Lig_outcome) table(df$Lig_outcome) # sanity checks # should be a factor is.factor(df$Lig_outcome); as.factor(df$Lig_outcome) # sanity checks # should be a factor is.factor(df$Lig_outcome); as.factor(df$Lig_outcome) #=========================== # Data preparation for plots #=========================== #!!!!!!!!!!!!!!!!! # REASSIGNMENT df <- my_df getwd() setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") getwd() ############################################################ # 1: Installing and loading required packages and functions ############################################################ source("../Header_TT.R") source("../barplot_colour_function.R") ############################################################ # Output dir for plots ############################################################ out_dir = "~/git/Data/pyrazinamide/output/plots" ############################################################ # 2: call script the prepares the data with columns containing # colours for axis labels ############################################################ source("subcols_axis.R") # this should return #mut_pos_cols: 130, 4 #my_df: 335, 39 # clear excess variable # "mut_pos_cols" is just for inspection in case you need to cross check # position numbers and colours # open file from deskptop ("sample_axis_cols") for cross checking table(mut_pos_cols$lab_bg) sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True table(mut_pos_cols$lab_bg2) sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True table(mut_pos_cols$lab_fg) sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True # very important! my_axis_colours = mut_pos_cols$lab_fg # now clear mut_pos_cols rm(mut_pos_cols) ########################### !!! only for mcsm_lig # 4: Filter/subset data # Lig plots < 10Ang # Filter the lig plots for Dis_to_lig < 10Ang ########################### # check range of distances max(my_df$Dis_lig_Ang) min(my_df$Dis_lig_Ang) # subset data to have only values less than 10 Ang my_df2 = subset(my_df, my_df$Dis_lig_Ang < 10) # sanity checks table(my_df2$Dis_lig_Ang<10) table(my_df2$Dis_lig_Ang>10) max(my_df2$Dis_lig_Ang) min(my_df2$Dis_lig_Ang) # count no of unique positions length(unique(my_df2$Position)) #count no of unique mutations length(unique(my_df2$Mutationinformation)) # clear variable to avoid confusion rm(my_df) #%%%%%%%%%% # Reassign to keep code below consistent #%%%%%%%%%% my_df = my_df2 ########################### # 2: Plot: Lig scores ########################### #========================== # Plot 2: Barplot with scores (unordered) # corresponds to Lig_outcome # Stacked Barplot with colours: Lig_outcome @ position coloured by # stability scores. This is a barplot where each bar corresponds # to a SNP and is coloured by its corresponding PredAff stability value. # Normalised values (range between -1 and 1 ) to aid visualisation # NOTE: since barplot plots discrete values, colour = score, so number of # colours will be equal to the no. of unique normalised scores # rather than a continuous scale # will require generating the colour scale separately. #============================ # sanity checks upos = unique(my_df$Position) str(my_df$Lig_outcome) colnames(my_df) #=========================== # Data preparation for plots #=========================== #!!!!!!!!!!!!!!!!! # REASSIGNMENT df <- my_df rm(my_df) # sanity checks # should be a factor is.factor(df$Lig_outcome); as.factor(df$Lig_outcome) df$Lig_outcome = as.factor(df$Lig_outcome) is.factor(df$Lig_outcome); table(df$Lig_outcome) # should be -1 and 1 min(df$ratioPredAff) max(df$ratioPredAff) # sanity checks # very important!!!! tapply(df$ratioPredAff, df$Lig_outcome, min) tapply(df$ratioPredAff, df$Lig_outcome, max) # sanity checks # very important!!!! tapply(df$ratioPredAff, df$Lig_outcome, min) tapply(df$ratioPredAff, df$Lig_outcome, max) # check unique values in normalised data u = unique(df$ratioPredAff) # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Run this section if rounding is to be used # specify number for rounding n = 3 df$ratioPredAffR = round(df$ratioPredAff, n) u = unique(df$ratioPredAffR) # create an extra column called group which contains the "gp name and score" # so colours can be generated for each unique values in this column my_grp = df$ratioPredAffR df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") # Call the function to create the palette based on the group defined above colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp") my_title = "Protein stability (PredAff)" library(ggplot2) # axis label size my_xaxls = 13 my_title = "Ligand Affinity" # axis label size my_xaxls = 13 my_yaxls = 15 # axes text size my_xaxts = 15 my_yaxts = 15 # no ordering of x-axis according to frequency g = ggplot(df, aes(factor(Position, ordered = T))) g + geom_bar(aes(fill = group), colour = "grey") + scale_fill_manual( values = colours , guide = 'none') + theme( axis.text.x = element_text(size = my_xaxls , angle = 90 , hjust = 1 , vjust = 0.4) , axis.text.y = element_text(size = my_yaxls , angle = 0 , hjust = 1 , vjust = 0) , axis.title.x = element_text(size = my_xaxts) , axis.title.y = element_text(size = my_yaxts ) ) + labs(title = my_title , x = "Position" , y = "Frequency") class(df$lab_bg) # make this a named vector # define cartesian coord my_xlim = length(unique(df$Position)); my_xlim # axis label size my_xals = 15 my_yals = 15 # axes text size my_xats = 15 my_yats = 18 # using geom_tile g = ggplot(df, aes(factor(Position, ordered = T))) g + coord_cartesian(xlim = c(1, my_xlim) , ylim = c(0, 6) , clip = "off") + geom_bar(aes(fill = group), colour = "grey") + scale_fill_manual( values = colours , guide = 'none') + geom_tile(aes(,-0.8, width = 0.95, height = 0.85) , fill = df$lab_bg) + geom_tile(aes(,-1.2, width = 0.95, height = -0.2) , fill = df$lab_bg2) + # Here it's important to specify that your axis goes from 1 to max number of levels theme( axis.text.x = element_text(size = my_xats , angle = 90 , hjust = 1 , vjust = 0.4 , colour = my_axis_colours) , axis.text.y = element_text(size = my_yats , angle = 0 , hjust = 1 , vjust = 0) , axis.title.x = element_text(size = my_xals) , axis.title.y = element_text(size = my_yals ) , axis.ticks.x = element_blank() ) + labs(title = my_title , x = "Position" , y = "Frequency") #======================== # output plot as svg/png #======================== class(df$lab_bg) # make this a named vector # define cartesian coord my_xlim = length(unique(df$Position)); my_xlim # axis label size my_xals = 18 my_yals = 18 # axes text size my_xats = 14 my_yats = 18 # set output dir for plots #getwd() #setwd("~/git/Data/pyrazinamide/output/plots") #getwd() getwd() setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") getwd() ############################################################ # 1: Installing and loading required packages and functions ############################################################ source("../Header_TT.R") source("../barplot_colour_function.R") ############################################################ # Output dir for plots ############################################################ out_dir = "~/git/Data/pyrazinamide/output/plots" ############################################################ # 2: call script the prepares the data with columns containing # colours for axis labels ############################################################ source("subcols_axis.R") # this should return #mut_pos_cols: 130, 4 #my_df: 335, 39 # clear excess variable # "mut_pos_cols" is just for inspection in case you need to cross check # position numbers and colours # open file from deskptop ("sample_axis_cols") for cross checking table(mut_pos_cols$lab_bg) sum( table(mut_pos_cols$lab_bg) ) == nrow(mut_pos_cols) # should be True table(mut_pos_cols$lab_bg2) sum( table(mut_pos_cols$lab_bg2) ) == nrow(mut_pos_cols) # should be True table(mut_pos_cols$lab_fg) sum( table(mut_pos_cols$lab_fg) ) == nrow(mut_pos_cols) # should be True # very important! my_axis_colours = mut_pos_cols$lab_fg # now clear mut_pos_cols rm(mut_pos_cols) ########################### !!! only for mcsm_lig # 4: Filter/subset data # Lig plots < 10Ang # Filter the lig plots for Dis_to_lig < 10Ang ########################### # check range of distances max(my_df$Dis_lig_Ang) min(my_df$Dis_lig_Ang) # subset data to have only values less than 10 Ang my_df2 = subset(my_df, my_df$Dis_lig_Ang < 10) # sanity checks table(my_df2$Dis_lig_Ang<10) table(my_df2$Dis_lig_Ang>10) max(my_df2$Dis_lig_Ang) min(my_df2$Dis_lig_Ang) # count no of unique positions length(unique(my_df2$Position)) #count no of unique mutations length(unique(my_df2$Mutationinformation)) # clear variable to avoid confusion rm(my_df) #%%%%%%%%%% # Reassign to keep code below consistent #%%%%%%%%%% my_df = my_df2 ########################### # 2: Plot: Lig scores ########################### #========================== # Plot 2: Barplot with scores (unordered) # corresponds to Lig_outcome # Stacked Barplot with colours: Lig_outcome @ position coloured by # stability scores. This is a barplot where each bar corresponds # to a SNP and is coloured by its corresponding PredAff stability value. # Normalised values (range between -1 and 1 ) to aid visualisation # NOTE: since barplot plots discrete values, colour = score, so number of # colours will be equal to the no. of unique normalised scores # rather than a continuous scale # will require generating the colour scale separately. #============================ # sanity checks upos = unique(my_df$Position) str(my_df$Lig_outcome) colnames(my_df) #=========================== # Data preparation for plots #=========================== #!!!!!!!!!!!!!!!!! # REASSIGNMENT df <- my_df #!!!!!!!!!!!!!!!!! rm(my_df) # sanity checks # should be a factor is.factor(df$Lig_outcome); #FALSE df$Lig_outcome = as.factor(df$Lig_outcome) is.factor(df$Lig_outcome); #TRUE table(df$Lig_outcome) # check the range min(df$ratioPredAff) max(df$ratioPredAff) # sanity checks # very important!!!! tapply(df$ratioPredAff, df$Lig_outcome, min) tapply(df$ratioPredAff, df$Lig_outcome, max) # My colour FUNCTION: based on group and subgroup # in my case; # df = df # group = Lig_outcome # subgroup = normalised score i.e ratioPredAff # Prepare data: round off ratioPredAff scores # round off to 3 significant digits: # 323 if no rounding is performed: used to generate the original graph # 287 if rounded to 3 places # FIXME: check if reducing precicion creates any ML prob # check unique values in normalised data u = unique(df$ratioPredAff) # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Run this section if rounding is to be used # specify number for rounding n = 3 df$ratioPredAffR = round(df$ratioPredAff, n) u = unique(df$ratioPredAffR) # create an extra column called group which contains the "gp name and score" # so colours can be generated for each unique values in this column my_grp = df$ratioPredAffR df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") # ELSE # uncomment the below if rounding is not required #my_grp = df$ratioPredAff #df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "") # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #****************** # generate plot #****************** # Call the function to create the palette based on the group defined above colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp") my_title = "Ligand Affinity" library(ggplot2) # axis label size my_xaxls = 13 my_yaxls = 15 # axes text size my_xaxts = 15 my_yaxts = 15 # no ordering of x-axis according to frequency g = ggplot(df, aes(factor(Position, ordered = T))) g + geom_bar(aes(fill = group), colour = "grey") + scale_fill_manual( values = colours , guide = 'none') + theme( axis.text.x = element_text(size = my_xaxls , angle = 90 , hjust = 1 , vjust = 0.4) , axis.text.y = element_text(size = my_yaxls , angle = 0 , hjust = 1 , vjust = 0) , axis.title.x = element_text(size = my_xaxts) , axis.title.y = element_text(size = my_yaxts ) ) + labs(title = my_title , x = "Position" , y = "Frequency") #======================== # plot with axis colours #======================== class(df$lab_bg) # make this a named vector # define cartesian coord my_xlim = length(unique(df$Position)); my_xlim # axis label size my_xals = 15 my_yals = 15 # axes text size my_xats = 15 my_yats = 18 # using geom_tile g = ggplot(df, aes(factor(Position, ordered = T))) g + coord_cartesian(xlim = c(1, my_xlim) , ylim = c(0, 6) , clip = "off") + geom_bar(aes(fill = group), colour = "grey") + scale_fill_manual( values = colours , guide = 'none') + geom_tile(aes(,-0.8, width = 0.95, height = 0.85) , fill = df$lab_bg) + geom_tile(aes(,-1.2, width = 0.95, height = -0.2) , fill = df$lab_bg2) + # Here it's important to specify that your axis goes from 1 to max number of levels theme( axis.text.x = element_text(size = my_xats , angle = 90 , hjust = 1 , vjust = 0.4 , colour = my_axis_colours) , axis.text.y = element_text(size = my_yats , angle = 0 , hjust = 1 , vjust = 0) , axis.title.x = element_text(size = my_xals) , axis.title.y = element_text(size = my_yals ) , axis.ticks.x = element_blank() ) + labs(title = my_title , x = "Position" , y = "Frequency") #======================== # output plot as svg/png #======================== class(df$lab_bg) # make this a named vector # define cartesian coord my_xlim = length(unique(df$Position)); my_xlim # axis label size my_xals = 18 my_yals = 18 # axes text size my_xats = 14 my_yats = 18 # set output dir for plots #getwd() #setwd("~/git/Data/pyrazinamide/output/plots") #getwd() plot_name = "barplot_LIG_acoloured.svg" my_plot_name = paste0(out_dir, "/", plot_name); my_plot_name svg(my_plot_name, width = 26, height = 4) g = ggplot(df, aes(factor(Position, ordered = T))) outFile = g + coord_cartesian(xlim = c(1, my_xlim) , ylim = c(0, 6) , clip = "off" ) + geom_bar(aes(fill = group), colour = "grey") + scale_fill_manual( values = colours , guide = 'none') + # geom_tile(aes(,-0.6, width = 0.9, height = 0.7) # , fill = df$lab_bg) + # geom_tile(aes(,-1, width = 0.9, height = 0.3) # , fill = df$lab_bg2) + geom_tile(aes(,-0.8, width = 0.95, height = 0.85) , fill = df$lab_bg) + geom_tile(aes(,-1.2, width = 0.95, height = -0.2) , fill = df$lab_bg2) + # Here it's important to specify that your axis goes from 1 to max number of levels theme( axis.text.x = element_text(size = my_xats , angle = 90 , hjust = 1 , vjust = 0.4 , colour = my_axis_colours) , axis.text.y = element_text(size = my_yats , angle = 0 , hjust = 1 , vjust = 0) , axis.title.x = element_text(size = my_xals) , axis.title.y = element_text(size = my_yals ) , axis.ticks.x = element_blank() ) + labs(title = "" , x = "Position" , y = "Frequency") print(outFile) dev.off() # for sanity and good practice #rm(df)