diff --git a/meta_data_analysis/combining_df_lig.R b/meta_data_analysis/combining_df_lig.R new file mode 100644 index 0000000..73afa6e --- /dev/null +++ b/meta_data_analysis/combining_df_lig.R @@ -0,0 +1,385 @@ +#======================================================================= +# TASK: To combine mcsm and meta data with af and or +# by filtering for distance to ligand (<10Ang). +# This script doesn't output anything. +# This script is sourced from other .R scripts for plotting ligand plots + +# Input csv files: +# 1) mcsm normalised and struct params +# 2) gene associated meta_data_with_AFandOR +#======================================================================= +#%% specify curr dir +getwd() +setwd('~/git/LSHTM_analysis/meta_data_analysis/') +getwd() +#======================================================================= +#%% load packages +#require(data.table) +#require(arsenal) +#require(compare) +#library(tidyverse) + +# header file +header_dir = '~/git/LSHTM_analysis/' +source(paste0(header_dir, '/', 'my_header.R')) +#%% variable assignment: input and output paths & filenames +#======================================================================= +drug = 'pyrazinamide' +gene = 'pncA' +gene_match = paste0(gene,'_p.') +cat(gene_match) + +#=========== +# data dir +#=========== +datadir = '~/git/Data' + +#=========== +# input +#=========== +# infile1: mCSM data +#indir = '~/git/Data/pyrazinamide/input/processed/' +indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline} +#in_filename = 'mcsm_complex1_normalised.csv' +in_filename = 'pnca_mcsm_struct_params.csv' +infile = paste0(indir, '/', in_filename) +cat(paste0('Reading infile1: mCSM output file', ' ', infile) ) + +# infile2: gene associated meta data combined with AF and OR +#indir: same as above +in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv') +infile_comb = paste0(indir, '/', in_filename_comb) +cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb)) + +#=========== +# output +#=========== +# Uncomment if and when required to output +outdir = paste0(datadir, '/', drug, '/', 'output') #same as indir +cat('Output dir: ', outdir, '\n') +#out_filename = paste0(tolower(gene), 'XXX') +#outfile = paste0(outdir, '/', out_filename) +#cat(paste0('Output file with full path:', outfile)) + +#%% end of variable assignment for input and output files +#======================================================================= +#%% Read input files + +##################################### +# input file 1: mcsm normalised data +# output of step 4 mcsm_pipeline +##################################### + +cat('Reading mcsm_data:' + , '\nindir: ', indir + , '\ninfile_comb: ', in_filename) + +mcsm_data = read.csv(infile +# , row.names = 1 + , stringsAsFactors = F + , header = T) + +cat('Read mcsm_data file:' + , '\nNo.of rows: ', nrow(mcsm_data) + , '\nNo. of cols:', ncol(mcsm_data)) + +# clear variables +rm(in_filename, infile) + +str(mcsm_data) + +table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) + +# spelling Correction 1: DUET +mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' +mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' + +# checks: should be the same as above +table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) +head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) + +# spelling Correction 2: Ligand +table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) + +mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' +mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' + +# checks: should be the same as above +table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) +head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) + +# muts with opposing effects on protomer and ligand stability +# excluded from here as it is redundant. +# check 'combining_two_df.R' to refer if required. +#======================================================================= +#%% !!!Filter data only for mcsm lig!!! + +########################### +# Filter/subset data +# Lig plots < 10Ang +# Filter the lig plots for Dis_to_lig < 10Ang +########################### + +# check range of distances +max(mcsm_data$Dis_lig_Ang) +min(mcsm_data$Dis_lig_Ang) + +# count +table(mcsm_data$Dis_lig_Ang<10) + +# subset data to have only values less than 10 Ang +mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10) + +# sanity checks +max(mcsm_data2$Dis_lig_Ang) +min(mcsm_data2$Dis_lig_Ang) + +# count no of unique positions +length(unique(mcsm_data2$Position)) + +# count no of unique mutations +length(unique(mcsm_data2$Mutationinformation)) + +# count Destabilisinga and stabilising +table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang} + +############################# +# Extra sanity check: +# for mcsm_lig ONLY +# Dis_lig_Ang should be <10 +############################# + +if (max(mcsm_data2$Dis_lig_Ang) < 10){ + print ("Sanity check passed: lig data is <10Ang") +}else{ + print ("Error: data should be filtered to be within 10Ang") +} + +#!!!!!!!!!!!!!!!!!!!!! +# REASSIGNMENT: so as not to alter the script +mcsm_data = mcsm_data2 +#!!!!!!!!!!!!!!!!!!!!! +#======================================================================= + +# clear variables +rm(mcsm_data2) + +# count na in each column +na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count + +# sort by Mutationinformation +mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] +head(mcsm_data$Mutationinformation) + +orig_col = ncol(mcsm_data) +# get freq count of positions and add to the df +setDT(mcsm_data)[, occurrence := .N, by = .(Position)] + +cat('Added 1 col: position frequency to see which posn has how many muts' + , '\nNo. of cols now', ncol(mcsm_data) + , '\nNo. of cols before: ', orig_col) + +pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) + +###################################### +# input file2 meta data with AFandOR +###################################### +cat('Reading combined meta data and AFandOR file:' + , '\nindir: ', indir + , '\ninfile_comb: ', in_filename_comb) + +meta_with_afor <- read.csv(infile_comb + , stringsAsFactors = F + , header = T) + +cat('Read mcsm_data file:' + , '\nNo.of rows: ', nrow(meta_with_afor) + , '\nNo. of cols:', ncol(meta_with_afor)) + +# clear variables +rm(in_filename_comb, infile_comb) + +str(meta_with_afor) + +# sort by Mutationinformation +head(meta_with_afor$Mutationinformation) +meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] +head(meta_with_afor$Mutationinformation) +#======================================================================= +cat('Begin merging dfs with NAs', + , '\n===============================================================') + +########################### +# merging two dfs: with NA +########################### +# link col name = 'Mutationinforamtion' +cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)' + ,'\nlinking col: Mutationinforamtion' + ,'\nfilename: merged_df2') + +head(mcsm_data$Mutationinformation) +head(meta_with_afor$Mutationinformation) + +######### +# a) merged_df2: meta data with mcsm +######### +merged_df2 = merge(x = meta_with_afor + , y = mcsm_data + , by = 'Mutationinformation' + , all.y = T) + +cat('Dim of merged_df2: ' + , '\nNo. of rows: ', nrow(merged_df2) + , '\nNo. of cols: ', ncol(merged_df2)) +head(merged_df2$Position) + +if(nrow(meta_with_afor) == nrow(merged_df2)){ + cat('nrow(merged_df2) = nrow (gene associated metadata)' + ,'\nExpected no. of rows: ',nrow(meta_with_afor) + ,'\nGot no. of rows: ', nrow(merged_df2)) +} else{ + cat('nrow(merged_df2)!= nrow(gene associated metadata)' + , '\nExpected no. of rows after merge: ', nrow(meta_with_afor) + , '\nGot no. of rows: ', nrow(merged_df2) + , '\nFinding discrepancy') + merged_muts_u = unique(merged_df2$Mutationinformation) + meta_muts_u = unique(meta_with_afor$Mutationinformation) + # find the index where it differs + unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) +} + +# sort by Position +head(merged_df2$Position) +merged_df2 = merged_df2[order(merged_df2$Position),] +head(merged_df2$Position) + +merged_df2v2 = merge(x = meta_with_afor + , y = mcsm_data + , by = 'Mutationinformation' + , all.x = T) +#!=!=!=!=!=!=!=! +# COMMENT: used all.y since position 186 is not part of the struc, +# hence doesn't have a mcsm value +# but 186 is associated with mutation +#!=!=!=!=!=!=!=! + +# should be False +identical(merged_df2, merged_df2v2) +table(merged_df2$Position%in%merged_df2v2$Position) + +rm(merged_df2v2) + +######### +# b) merged_df3: remove duplicate mutation information +######### +cat('Merging dfs with NAs: small df (removing duplicate muts)' + ,'\nCannot trust lineage info from this' + ,'\nlinking col: Mutationinforamtion' + ,'\nfilename: merged_df3') + +#==#=#=#=#=#=# +# Cannot trust lineage, country from this df as the same mutation +# can have many different lineages +# but this should be good for the numerical corr plots +#=#=#=#=#=#=#= +merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] +head(merged_df3$Position); tail(merged_df3$Position) # should be sorted + +# sanity checks +# nrows of merged_df3 should be the same as the nrows of mcsm_data +if(nrow(mcsm_data) == nrow(merged_df3)){ + cat('PASS: No. of rows match with mcsm_data' + ,'\nExpected no. of rows: ', nrow(mcsm_data) + ,'\nGot no. of rows: ', nrow(merged_df3)) +} else { + cat('FAIL: No. of rows mismatch' + , '\nNo. of rows mcsm_data: ', nrow(mcsm_data) + , '\nNo. of rows merged_df3: ', nrow(merged_df3)) +} + +# counting NAs in AF, OR cols in merged_df3 +# this is becuase mcsm has no AF, OR cols, +# so you cannot count NAs +if (identical(sum(is.na(merged_df3$OR)) + , sum(is.na(merged_df3$pvalue)) + , sum(is.na(merged_df3$AF)))){ + cat('PASS: NA count match for OR, pvalue and AF\n') + na_count_df3 = sum(is.na(merged_df3$AF)) + cat('No. of NAs: ', sum(is.na(merged_df3$OR))) +} else{ + cat('FAIL: NA count mismatch' + , '\nNA in OR: ', sum(is.na(merged_df3$OR)) + , '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue)) + , '\nNA in AF:', sum(is.na(merged_df3$AF))) +} +#======================================================================= +#%% merging without NAs + +cat('Begin merging dfs without NAs', + , '\n===============================================================') + +cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)' + ,'\nlinking col: Mutationinforamtion' + ,'\nfilename: merged_df2_comp') + +######### +# c) merged_df2_comp: merging two dfs without NA +######### +merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] +#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),] + +# sanity check +cat('Checking nrows in merged_df2_comp') +if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){ + cat('PASS: No. of rows match' + ,'\nDim of merged_df2_comp: ' + ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 + , '\nNo. of rows: ', nrow(merged_df2_comp) + , '\nNo. of cols: ', ncol(merged_df2_comp)) +}else{ + cat('FAIL: No. of rows mismatch' + ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 + ,'\nGot no. of rows: ', nrow(merged_df2_comp)) +} + +######### +# d) merged_df3_comp: remove duplicate mutation information +######### +merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] + +cat('Dim of merged_df3_comp: ' + , '\nNo. of rows: ', nrow(merged_df3_comp) + , '\nNo. of cols: ', ncol(merged_df3_comp)) + +# alternate way of deriving merged_df3_comp +foo = merged_df3[!is.na(merged_df3$AF),] +# compare dfs: foo and merged_df3_com +all.equal(foo, merged_df3) + +summary(comparedf(foo, merged_df3)) + +# sanity check +cat('Checking nrows in merged_df3_comp') +if(nrow(merged_df3_comp) == nrow(merged_df3)){ + cat('NO NAs detected in merged_df3 in AF|OR cols' + ,'\nNo. of rows are identical: ', nrow(merged_df3)) +} else{ + if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) { + cat('PASS: NAs detected in merged_df3 in AF|OR cols' + , '\nNo. of NAs: ', na_count_df3 + , '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3 + , '\nGot no. of rows: ', nrow(merged_df3_comp)) + } +} + +#======================================================================= +# write_output files +# Not required as this is a subset of the combining_two_df.R +#************************* +# clear variables +rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir) +rm(pos_count_check) +#%% end of script +#======================================================================= + + diff --git a/meta_data_analysis/combining_df_ps.R b/meta_data_analysis/combining_df_ps.R new file mode 100644 index 0000000..212a36b --- /dev/null +++ b/meta_data_analysis/combining_df_ps.R @@ -0,0 +1,423 @@ +#======================================================================= +# TASK: To combine mcsm and meta data with af and or files +# Input csv files: +# 1) mcsm normalised and struct params +# 2) gene associated meta_data_with_AFandOR + +# Output: +# 1) muts with opposite effects on stability +# 2) large combined df including NAs for AF, OR,etc +# Dim: same no. of rows as gene associated meta_data_with_AFandOR +# 3) small combined df including NAs for AF, OR, etc. +# Dim: same as mcsm data +# 4) large combined df excluding NAs +# Dim: dim(#1) - no. of NAs(AF|OR) + 1 +# 5) small combined df excluding NAs +# Dim: dim(#2) - no. of unique NAs - 1 +# This script is sourced from other .R scripts for plotting +#======================================================================= +#%% specify curr dir +getwd() +setwd('~/git/LSHTM_analysis/meta_data_analysis/') +getwd() +#======================================================================= +#%% load packages +#require(data.table) +#require(arsenal) +#require(compare) +#library(tidyverse) + +# header file +header_dir = '~/git/LSHTM_analysis/' +source(paste0(header_dir, '/', 'my_header.R')) +#======================================================================= +#%% variable assignment: input and output paths & filenames +drug = 'pyrazinamide' +gene = 'pncA' +gene_match = paste0(gene,'_p.') +cat(gene_match) + +#=========== +# data dir +#=========== +datadir = '~/git/Data' + +#=========== +# input +#=========== +# infile1: mCSM data +#indir = '~/git/Data/pyrazinamide/input/processed/' +indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline} +#in_filename = 'mcsm_complex1_normalised.csv' +in_filename = 'pnca_mcsm_struct_params.csv' +infile = paste0(indir, '/', in_filename) +cat(paste0('Reading infile1: mCSM output file', ' ', infile, '\n') ) + +# infile2: gene associated meta data combined with AF and OR +#indir: same as above +in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv') +infile_comb = paste0(indir, '/', in_filename_comb) +cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb, '\n')) + +#=========== +# output +#=========== +# Uncomment if and when required to output +outdir = paste0(datadir, '/', drug, '/', 'output') #same as indir +cat('Output dir: ', outdir, '\n') +#out_filename = paste0(tolower(gene), 'XXX') +#outfile = paste0(outdir, '/', out_filename) +#cat(paste0('Output file with full path:', outfile)) + +#%% end of variable assignment for input and output files +#======================================================================= +#%% Read input files + +##################################### +# input file 1: mcsm normalised data +# output of step 4 mcsm_pipeline +##################################### +cat('Reading mcsm_data:' + , '\nindir: ', indir + , '\ninfile_comb: ', in_filename) + +mcsm_data = read.csv(infile +# , row.names = 1 + , stringsAsFactors = F + , header = T) + +cat('Read mcsm_data file:' + , '\nNo.of rows: ', nrow(mcsm_data) + , '\nNo. of cols:', ncol(mcsm_data)) + +# clear variables +rm(in_filename, infile) + +str(mcsm_data) + +table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) + +# spelling Correction 1: DUET +mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' +mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' + +# checks: should be the same as above +table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) +head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) + +# spelling Correction 2: Ligand +table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) + +mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' +mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' + +# checks: should be the same as above +table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) +head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) + +# muts with opposing effects on protomer and ligand stability +table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome) +changes = mcsm_data[which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome),] + +# sanity check: redundant, but uber cautious! +dl_i = which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome) +ld_i = which(mcsm_data$Lig_outcome != mcsm_data$DUET_outcome) + +cat('Identifying muts with opposite stability effects') +if(nrow(changes) == (table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)[[2]]) & identical(dl_i,ld_i)) { + cat('PASS: muts with opposite effects on stability and affinity correctly identified' + , '\nNo. of such muts: ', nrow(changes)) +}else { + cat('FAIL: unsuccessful in extracting muts with changed stability effects') +} + +#*************************** +# write file: changed muts +out_filename = 'muts_opp_effects.csv' +outfile = paste0(outdir, '/', out_filename) +cat('Writing file for muts with opp effects:' + , '\nFilename: ', outfile + , '\nPath: ', outdir) + +write.csv(changes, outfile) +#**************************** +# clear variables +rm(out_filename, outfile) +rm(changes, dl_i, ld_i) + +# count na in each column +na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count + +# sort by Mutationinformation +mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] +head(mcsm_data$Mutationinformation) + +orig_col = ncol(mcsm_data) + +# get freq count of positions and add to the df +setDT(mcsm_data)[, occurrence := .N, by = .(Position)] + +cat('Added 1 col: position frequency to see which posn has how many muts' + , '\nNo. of cols now', ncol(mcsm_data) + , '\nNo. of cols before: ', orig_col) + +pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence) + +####################################### +# input file 2: meta data with AFandOR +####################################### +cat('Reading combined meta data and AFandOR file:' + , '\nindir: ', indir + , '\ninfile_comb: ', in_filename_comb) + +meta_with_afor <- read.csv(infile_comb + , stringsAsFactors = F + , header = T) + +cat('Read mcsm_data file:' + , '\nNo.of rows: ', nrow(meta_with_afor) + , '\nNo. of cols:', ncol(meta_with_afor)) + +# counting NAs in AF, OR cols +if (identical(sum(is.na(meta_with_afor$OR)) + , sum(is.na(meta_with_afor$pvalue)) + , sum(is.na(meta_with_afor$AF)))){ + cat('PASS: NA count match for OR, pvalue and AF\n') + na_count = sum(is.na(meta_with_afor$AF)) + cat('No. of NAs: ', sum(is.na(meta_with_afor$OR))) +} else{ + cat('FAIL: NA count mismatch' + , '\nNA in OR: ', sum(is.na(meta_with_afor$OR)) + , '\nNA in pvalue: ', sum(is.na(meta_with_afor$pvalue)) + , '\nNA in AF:', sum(is.na(meta_with_afor$AF))) +} + +# clear variables +rm(in_filename_comb, infile_comb) + +str(meta_with_afor) + +# sort by Mutationinformation +head(meta_with_afor$Mutationinformation) +meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] +head(meta_with_afor$Mutationinformation) +#======================================================================= +cat('Begin merging dfs with NAs', + , '\n===============================================================') + +########################### +# merging two dfs: with NA +########################### +# link col name = 'Mutationinforamtion' +head(mcsm_data$Mutationinformation) +head(meta_with_afor$Mutationinformation) + +cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)' + ,'\nlinking col: Mutationinforamtion' + ,'\nfilename: merged_df2') + +######### +# a) merged_df2: meta data with mcsm +######### +merged_df2 = merge(x = meta_with_afor + ,y = mcsm_data + , by = 'Mutationinformation' + , all.y = T) + +cat('Dim of merged_df2: ' + , '\nNo. of rows: ', nrow(merged_df2) + , '\nNo. of cols: ', ncol(merged_df2)) +head(merged_df2$Position) + +# sanity check +cat('Checking nrows in merged_df2') +if(nrow(meta_with_afor) == nrow(merged_df2)){ + cat('nrow(merged_df2) = nrow (gene associated metadata)' + ,'\nExpected no. of rows: ',nrow(meta_with_afor) + ,'\nGot no. of rows: ', nrow(merged_df2)) +} else{ + cat('nrow(merged_df2)!= nrow(gene associated metadata)' + , '\nExpected no. of rows after merge: ', nrow(meta_with_afor) + , '\nGot no. of rows: ', nrow(merged_df2) + , '\nFinding discrepancy') + merged_muts_u = unique(merged_df2$Mutationinformation) + meta_muts_u = unique(meta_with_afor$Mutationinformation) + # find the index where it differs + unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) +} + +# sort by Position +head(merged_df2$Position) +merged_df2 = merged_df2[order(merged_df2$Position),] +head(merged_df2$Position) + +merged_df2v2 = merge(x = meta_with_afor + ,y = mcsm_data + , by = 'Mutationinformation' + , all.x = T) +#!=!=!=!=!=!=!=! +# COMMENT: used all.y since position 186 is not part of the struc, +# hence doesn't have a mcsm value +# but 186 is associated with mutation +#!=!=!=!=!=!=!=! + +# should be False +identical(merged_df2, merged_df2v2) +table(merged_df2$Position%in%merged_df2v2$Position) + +rm(merged_df2v2) + +######### +# b) merged_df3:remove duplicate mutation information +######### +cat('Merging dfs without NAs: small df (removing muts with no AF|OR associated)' + ,'\nCannot trust lineage info from this' + ,'\nlinking col: Mutationinforamtion' + ,'\nfilename: merged_df3') + +#==#=#=#=#=#=# +# Cannot trust lineage, country from this df as the same mutation +# can have many different lineages +# but this should be good for the numerical corr plots +#=#=#=#=#=#=#= +merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] +head(merged_df3$Position); tail(merged_df3$Position) # should be sorted + +# sanity check +cat('Checking nrows in merged_df3') +if(nrow(mcsm_data) == nrow(merged_df3)){ + cat('PASS: No. of rows match with mcsm_data' + ,'\nExpected no. of rows: ', nrow(mcsm_data) + ,'\nGot no. of rows: ', nrow(merged_df3)) +} else { + cat('FAIL: No. of rows mismatch' + , '\nNo. of rows mcsm_data: ', nrow(mcsm_data) + , '\nNo. of rows merged_df3: ', nrow(merged_df3)) +} + +# counting NAs in AF, OR cols in merged_df3 +# this is becuase mcsm has no AF, OR cols, +# so you cannot count NAs +if (identical(sum(is.na(merged_df3$OR)) + , sum(is.na(merged_df3$pvalue)) + , sum(is.na(merged_df3$AF)))){ + cat('PASS: NA count match for OR, pvalue and AF\n') + na_count_df3 = sum(is.na(merged_df3$AF)) + cat('No. of NAs: ', sum(is.na(merged_df3$OR))) +} else{ + cat('FAIL: NA count mismatch' + , '\nNA in OR: ', sum(is.na(merged_df3$OR)) + , '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue)) + , '\nNA in AF:', sum(is.na(merged_df3$AF))) +} +#======================================================================= +#%% merging without NAs + +cat('Begin merging dfs without NAs', + , '\n===============================================================') + +cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)' + ,'\nlinking col: Mutationinforamtion' + ,'\nfilename: merged_df2_comp') + +######### +# c) merged_df2_comp: same as merge 1 but excluding NA +######### +merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] +#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),] + +# sanity check +cat('Checking nrows in merged_df2_comp') +if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){ + cat('PASS: No. of rows match' + ,'\nDim of merged_df2_comp: ' + ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 + , '\nNo. of rows: ', nrow(merged_df2_comp) + , '\nNo. of cols: ', ncol(merged_df2_comp)) +}else{ + cat('FAIL: No. of rows mismatch' + ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 + ,'\nGot no. of rows: ', nrow(merged_df2_comp)) +} + +######### +# d) merged_df3_comp: remove duplicate mutation information +######### +merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] + +cat('Dim of merged_df3_comp: ' + , '\nNo. of rows: ', nrow(merged_df3_comp) + , '\nNo. of cols: ', ncol(merged_df3_comp)) + +# alternate way of deriving merged_df3_comp +foo = merged_df3[!is.na(merged_df3$AF),] +# compare dfs: foo and merged_df3_com +all.equal(foo, merged_df3) + +summary(comparedf(foo, merged_df3)) + +# sanity check +cat('Checking nrows in merged_df3_comp') +if(nrow(merged_df3_comp) == nrow(merged_df3)){ + cat('NO NAs detected in merged_df3 in AF|OR cols' + ,'\nNo. of rows are identical: ', nrow(merged_df3)) +} else{ + if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) { + cat('PASS: NAs detected in merged_df3 in AF|OR cols' + , '\nNo. of NAs: ', na_count_df3 + , '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3 + , '\nGot no. of rows: ', nrow(merged_df3_comp)) + } +} +#======================================================================= +#********************* +# writing 1 file in the style of a loop: merged_df3 +# print(output dir) +#i = 'merged_df3' +#out_filename = paste0(i, '.csv') +#outfile = paste0(outdir, '/', out_filename) + +#cat('Writing output file: ' +# ,'\nFilename: ', out_filename +# ,'\nPath: ', outdir) + +#template: write.csv(merged_df3, 'merged_df3.csv') +#write.csv(get(i), outfile, row.names = FALSE) +#cat('Finished writing: ', outfile +# , '\nNo. of rows: ', nrow(get(i)) +# , '\nNo. of cols: ', ncol(get(i))) + +#%% write_output files; all 4 files: +outvars = c('merged_df2' + , 'merged_df3' + , 'merged_df2_comp' + , 'merged_df3_comp') + +cat('Writing output files: ' + , '\nPath:', outdir) + +for (i in outvars){ +# cat(i, '\n') + out_filename = paste0(i, '.csv') +# cat(out_filename, '\n') +# cat('getting value of variable: ', get(i)) + outfile = paste0(outdir, '/', out_filename) +# cat('Full output path: ', outfile, '\n') + cat('Writing output file:' + ,'\nFilename: ', out_filename,'\n') + write.csv(get(i), outfile, row.names = FALSE) + cat('Finished writing: ', outfile + , '\nNo. of rows: ', nrow(get(i)) + , '\nNo. of cols: ', ncol(get(i)), '\n') +} + +# alternate way to replace with implicit loop +# FIXME +#sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, '/', outvars, '.csv'))) +#************************* +# clear variables +rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir) +rm(pos_count_check) +#%% end of script +#======================================================================= +