diff --git a/meta_data_analysis/combining_df_lig.R b/meta_data_analysis/combining_df_lig.R index 73afa6e..d9b6b9a 100644 --- a/meta_data_analysis/combining_df_lig.R +++ b/meta_data_analysis/combining_df_lig.R @@ -160,7 +160,6 @@ if (max(mcsm_data2$Dis_lig_Ang) < 10){ mcsm_data = mcsm_data2 #!!!!!!!!!!!!!!!!!!!!! #======================================================================= - # clear variables rm(mcsm_data2) @@ -173,13 +172,13 @@ head(mcsm_data$Mutationinformation) orig_col = ncol(mcsm_data) # get freq count of positions and add to the df -setDT(mcsm_data)[, occurrence := .N, by = .(Position)] +setDT(mcsm_data)[, mut_pos_occurrence := .N, by = .(position)] -cat('Added 1 col: position frequency to see which posn has how many muts' +cat('Added col: position frequency to see which posn has how many muts' , '\nNo. of cols now', ncol(mcsm_data) , '\nNo. of cols before: ', orig_col) -pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) +mut_pos_occurrence = data.frame(mcsm_data$id, mcsm_data$Position, mcsm_data$mut_pos_occurrence) ###################################### # input file2 meta data with AFandOR @@ -205,8 +204,21 @@ str(meta_with_afor) head(meta_with_afor$Mutationinformation) meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] head(meta_with_afor$Mutationinformation) + +orig_col2 = ncol(meta_with_afor) + +# get freq count of positions and add to the df +setDT(meta_with_afor)[, sample_pos_occurrence := .N, by = .(Position)] + +cat('Added col: position frequency of samples to check' + ,'how many samples correspond to a partiulcar posn associated with muts' + , '\nNo. of cols now', ncol(meta_with_afor) + , '\nNo. of cols before: ', orig_col2) + +sample_pos_occurrence = data.frame(meta_with_afor$id, meta_with_afor$position, meta_with_afor$sample_pos_occurrence) + #======================================================================= -cat('Begin merging dfs with NAs', +cat('Begin merging dfs with NAs' , '\n===============================================================') ########################### @@ -315,7 +327,7 @@ if (identical(sum(is.na(merged_df3$OR)) #======================================================================= #%% merging without NAs -cat('Begin merging dfs without NAs', +cat('Begin merging dfs without NAs' , '\n===============================================================') cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)' @@ -378,7 +390,7 @@ if(nrow(merged_df3_comp) == nrow(merged_df3)){ #************************* # clear variables rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir) -rm(pos_count_check) +rm(mut_pos_occurrence) #%% end of script #======================================================================= diff --git a/meta_data_analysis/combining_df_ps.R b/meta_data_analysis/combining_df_ps.R index 212a36b..770661d 100644 --- a/meta_data_analysis/combining_df_ps.R +++ b/meta_data_analysis/combining_df_ps.R @@ -155,14 +155,17 @@ head(mcsm_data$Mutationinformation) orig_col = ncol(mcsm_data) # get freq count of positions and add to the df -setDT(mcsm_data)[, occurrence := .N, by = .(Position)] +setDT(mcsm_data)[, mut_pos_occurrence := .N, by = .(Position)] -cat('Added 1 col: position frequency to see which posn has how many muts' +cat('Added col: position frequency of muts to see which posn has how many muts' , '\nNo. of cols now', ncol(mcsm_data) , '\nNo. of cols before: ', orig_col) -pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) +mut_pos_occurrence = data.frame(mcsm_data$Mutationinformation + , mcsm_data$Position + , mcsm_data$mut_pos_occurrence) +colnames(mut_pos_occurrence) = c('Mutationinformation', 'position', 'mut_pos_occurrence') ####################################### # input file 2: meta data with AFandOR ####################################### @@ -201,8 +204,25 @@ str(meta_with_afor) head(meta_with_afor$Mutationinformation) meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] head(meta_with_afor$Mutationinformation) + +orig_col2 = ncol(meta_with_afor) + +# get freq count of positions and add to the df +setDT(meta_with_afor)[, sample_pos_occurrence := .N, by = .(position)] + +cat('Added col: position frequency of samples to check' + ,'how many samples correspond to a partiulcar posn associated with muts' + , '\nNo. of cols now', ncol(meta_with_afor) + , '\nNo. of cols before: ', orig_col2) + +sample_pos_occurrence = data.frame(meta_with_afor$id + , meta_with_afor$mutation + , meta_with_afor$Mutationinformation + , meta_with_afor$position + , meta_with_afor$sample_pos_occurrence) +colnames(sample_pos_occurrence) = c('id', 'mutation', 'Mutationinformation', 'position', 'sample_pos_occurrence') #======================================================================= -cat('Begin merging dfs with NAs', +cat('Begin merging dfs with NAs' , '\n===============================================================') ########################### @@ -313,7 +333,7 @@ if (identical(sum(is.na(merged_df3$OR)) #======================================================================= #%% merging without NAs -cat('Begin merging dfs without NAs', +cat('Begin merging dfs without NAs' , '\n===============================================================') cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)' @@ -414,10 +434,28 @@ for (i in outvars){ # alternate way to replace with implicit loop # FIXME #sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, '/', outvars, '.csv'))) + +#======================================================================= +#%% merging mut_pos_occurrence and sample_pos_occurence +# FIXME +#cat('Merging dfs with positional frequency from mcsm and meta_data' +# , '\nNcol in mut_pos_occurrence:', ncol(mut_pos_occurrence) +# , '\nncol in sample_pos_occurence:', ncol(sample_pos_occurrence) +# ,'\nlinking col:', intersect(colnames(sample_pos_occurrence), colnames(mut_pos_occurrence)) +# ,'\nfilename: merged_df4') + +#merged_df4 = merge(sample_pos_occurrence, mut_pos_occurrence +# , by = 'position' +# , all = T) + +#out_filename4 = 'mut_and_sample_freq.csv' +#outfile4 = paste0(outdir, '/', out_filename4) + #************************* # clear variables rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir) -rm(pos_count_check) +rm(mut_pos_occurrence, sample_pos_occurrence) +#rm(merged_df4) #%% end of script #======================================================================= diff --git a/plotting_test/logo_plot.R b/plotting_test/logo_plot.R new file mode 100644 index 0000000..be1d9d7 --- /dev/null +++ b/plotting_test/logo_plot.R @@ -0,0 +1,221 @@ +#======================================================================= +# Task: To generate a logo plot or bar plot but coloured +# aa properties. +# step1: read mcsm file and OR file +# step2: plot wild type positions +# step3: plot mutants per position coloured by aa properties +# step4: make the size of the letters/bars prop to OR if you can! + +# useful links +# https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 +# https://omarwagih.github.io/ggseqlogo/ +# https://kkdey.github.io/Logolas-pages/workflow.html +# A new sequence logo plot to highlight enrichment and depletion. +# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ + +#very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ +#======================================================================= +#%% specify curr dir +getwd() +setwd('~/git/LSHTM_analysis/plotting_test/') +getwd() +#======================================================================= +#%% load packages + +# header file +header_dir = '~/git/LSHTM_analysis/' +source(paste0(header_dir, '/', 'my_header.R')) +#======================================================================= +#%% variable assignment: input and output paths & filenames +drug = 'pyrazinamide' +gene = 'pncA' +gene_match = paste0(gene,'_p.') +cat(gene_match) + +#=========== +# data dir +#=========== +datadir = paste0('~/git/Data') + +#=========== +# input +#=========== +# source R script 'combining_two_df.R' +#indir = paste0(datadir, '/', drug, '/', 'output') # reading files +indir = '../meta_data_analysis' # sourcing R script +in_filename = 'combining_df_ps.R' +infile = paste0(indir, '/', in_filename) +cat(paste0('Input is a R script: ', '\'', infile, '\'') + , '\n========================================================') + +#=========== +# output +#=========== +# 1) lineage dist of all muts +outdir = paste0('~/git/Data', '/', drug, '/', 'output/plots') #same as indir +#cat('Output dir: ', outdir, '\n') +#file_type = '.svg' +#out_filename1 = paste0(tolower(gene), '_lineage_dist_ps', file_type) +#outfile1 = paste0(outdir, '/', out_filename1) +#cat(paste0('Output plot1 :', outfile1) +# , '\n========================================================') + +#%% end of variable assignment for input and output files +#======================================================================= +##%% read input file +cat('Reading input file(sourcing R script):', in_filename) + +source(infile) + +#========================== +# This will return: + +# df with NA for pyrazinamide: +# merged_df2 +# merged_df3 + +# df without NA for pyrazinamide: +# merged_df2_comp +# merged_df3_comp +#=========================== + +########################### +# Data for plots +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +# using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available, hence use df with NA + +# This will the first plotting df +# Then subset this to extract dr muts only (second plottig df) +########################### + +#%%%%%%%%%%%%%%%%%%%%%%%%% +# uncomment as necessary +# REASSIGNMENT +#my_data = merged_df2 +#my_data = merged_df2_comp +#my_data = merged_df3 +my_data = merged_df3_comp +#%%%%%%%%%%%%%%%%%%%%%%%%%% + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_data) +str(my_data) + +c1 = unique(my_data$Position) +nrow(my_data) +cat('No. of rows in my_data:', nrow(my_data) + , '\nDistinct positions corresponding to snps:', length(c1) + , '\n===========================================================') + +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# FIXME: Think and decide what you want to remove +# mut_pos_occurence < 1 or sample_pos_occurrence <1 + +# get freq count of positions so you can subset freq<1 +require(data.table) +#setDT(my_data)[, mut_pos_occurrence := .N, by = .(Position)] #265, 14 + +#extract freq_pos>1 +#my_data_snp = my_data[my_data$occurrence!=1,] + +#u = unique(my_data_snp$Position) #73 +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# REASSIGNMENT to prevent changing code +my_data_snp = my_data +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +#======================================================================= +#%% logo plots from dataframe + +############# +# PLOTS: ggseqlogo with custom height +# https://omarwagih.github.io/ggseqlogo/ +############# +#require(ggplot2) +#require(tidyverse) +library(ggseqlogo) + +foo = my_data_snp[, c("Position", "Mutant_type","ratioDUET", "OR" + , "mut_prop_polarity", "mut_prop_water") ] + +# log10OR +# FIXME: at the source script (when calculating AFandOR) +my_data_snp$log10or = log10(my_data_snp$OR) +bar = my_data_snp[, c('Position', 'Mutant_type', 'OR', 'logor', 'log10or')] + + +bar_or = my_data_snp[, c('Position', 'Mutant_type', 'OR')] +wide_df_or <- bar_or %>% spread(Position, OR, fill = 0) +wide_df_or = as.matrix(wide_df_or) +rownames(wide_df_or) = wide_df_or[,1] +wide_df_or = wide_df_or[,-1] + +# custom height (OR) logo plot: yayy works +ggseqlogo(wide_df_or, method='custom', seq_type='aa') + ylab('my custom height') + + theme(legend.position = "bottom" + , axis.text.x = element_text(size = 11 + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = 15 + , angle = 0 + , hjust = 1 + , vjust = 0))+ + labs(title = "AA logo plot" + , x = "Wild-type Position" + , y = "OR") +#%% end of logo plot with OR as height +#======================================================================= +# extracting data with log10OR +bar_logor = my_data_snp[, c('Position', 'Mutant_type', 'log10or')] +wide_df_logor <- bar_logor %>% spread(Position, log10or, fill = 0) + +wide_df_logor = as.matrix(wide_df_logor) +rownames(wide_df_logor) = wide_df_logor[,1] +wide_df_logor = wide_df_logor[,-1] + +# custom height (log10OR) logo plot: yayy works +ggseqlogo(wide_df_logor, method='custom', seq_type='aa') + ylab('my custom height') + + theme(legend.position = "bottom" + , axis.text.x = element_text(size = 11 + , angle = 90 + , hjust = 1 + , vjust = 0.4) + , axis.text.y = element_text(size = 15 + , angle = 0 + , hjust = 1 + , vjust = 0))+ + labs(title = "AA logo plot" + , x = "Wild-type Position" + , y = "Log10(OR)") + +#======================================================================= +#%% logo plot from sequence + +################# +# Plot: LOGOLAS (ED plots) +# link: https://github.com/kkdey/Logolas +# on all pncA samples: output of mutate.py +################ +library(Logolas) + +seqs = read.csv('~/git//Data/pyrazinamide/output/pnca_msa.txt' + , header = FALSE + , stringsAsFactors = FALSE)$V1 + + +# my_data: useful! +logomaker(seqs, type = "EDLogo", color_type = 'per_symbol' + , return_heights = TRUE) +logomaker(seqs, type = "Logo", color_type = 'per_symbol') + +#%% end of script +#======================================================================= \ No newline at end of file