diff --git a/scripts/AF_and_OR_calcs.R b/scripts/AF_and_OR_calcs.R index d32c70d..292061d 100644 --- a/scripts/AF_and_OR_calcs.R +++ b/scripts/AF_and_OR_calcs.R @@ -481,182 +481,7 @@ cat('End of script: calculated AF, OR, pvalues and saved file') ######################################################### # 3: Merge meta data file + calculated num params ######################################################### -df1 = gene_metadata -df2 = comb_AF_and_OR - -cat('checking commom col of the two dfs before merging:' - ,'\ndf1:', head(df1$mutation) - , '\ndf2:', head(df2$mutation)) - -cat(paste0('merging two dfs: ' - ,'\ndf1 (big df i.e. meta data) nrows: ', nrow(df1) - ,'\ndf2 (small df i.e af, or, pval) nrows: ', nrow(df2) - ,'\nexpected rows in merged df: ', nrow(df1) - ,'\nexpected cols in merged_df: ', (ncol(df1) + ncol(df2) - 1))) - -merged_df = merge(df1 # big file - , df2 # small (afor file) - , by = "mutation" - , all.x = T) # because you want all the entries of the meta data - -# sanity check -if(ncol(merged_df) == (ncol(df1) + ncol(df2) - 1)){ - cat(paste0('PASS: no. of cols is as expected: ', ncol(merged_df))) -} else{ - cat('FAIL: no.of cols mistmatch') -} - -# quick check -i = "pnca_p.ala134gly" # has all NAs in pyrazinamide, should be NA in ors, etc. -merged_df[merged_df$mutation == i,] - -# count na in each column -na_count = sapply(merged_df, function(y) sum(length(which(is.na(y))))); na_count - -# check last three cols: should be NA -if ( identical(na_count[[length(na_count)]], na_count[[length(na_count)-1]], na_count[[length(na_count)-2]])){ - cat('PASS: No. of NAs for OR, AF and Pvals are equal as expected', - '\nNo. of NA: ', na_count[[length(na_count)]]) -} else { - cat('FAIL: No. 
of NAs for OR, AF and Pvals mismatch') -} - -# reassign custom colnames -#cat('Assigning custom colnames for the calculated params...') -#colnames(merged_df)[colnames(merged_df)== "ors"] <- "OR" -#colnames(merged_df)[colnames(merged_df)== "pvals"] <- "pvalue" -#colnames(merged_df)[colnames(merged_df)== "afs"] <- "AF" - -colnames(merged_df) - -# add 3 more cols: log OR, neglog pvalue and AF_percent cols -merged_df$logor = log(merged_df$OR) -is.numeric(merged_df$logor) - -merged_df$neglog10pvalue = -log10(merged_df$pvalue) -is.numeric(merged_df$neglog10pvalue) - -merged_df$AF_percent = merged_df$AF*100 -is.numeric(merged_df$AF_percent) - -# check AFs -#i = 'pnca_p.trp68gly' -i = 'pnca_p.gln10pro' -#i = 'pnca_p.leu4ser' -merged_df[merged_df$mutation == i,] - -# FIXME: harcoding (beware!), NOT FATAL though! -ncol_added = 3 - -cat(paste0('Added', ' ', ncol_added, ' more cols to merged_df:' - , '\ncols added: logor, neglog10pvalue and AF_percent:' - , '\nno. of cols in merged_df now: ', ncol(merged_df))) - -#%% write file out: pnca_meta_data_with_AF_OR -#********************************************* -cat(paste0('writing output file: ' - , '\nFilename: ', out_filename - , '\nPath:', outdir)) - -write.csv(merged_df, outfile - , row.names = F) - -cat(paste0('Finished writing:' - , out_filename - , '\nNo. of rows: ', nrow(merged_df) - , '\nNo. of cols: ', ncol(merged_df))) -#************************************************ -cat('======================================================================') -rm(out_filename) -cat('End of script: calculated AF, OR, pvalues and saved file') -# End of script -#%% -# sanity check: Count NA in these four cols. -# However these need to be numeric else these -# will be misleading when counting NAs (i.e retrun 0) -#is.numeric(meta_with_afor$OR) -na_var = c('AF', 'OR', 'pvalue', 'logor', 'neglog10pvalue') - -# loop through these vars and check if these are numeric. 
-# if not, then convert to numeric -check_all = NULL - -for (i in na_var){ - # cat(i) - check0 = is.numeric(meta_with_afor[,i]) - if (check0) { - check_all = c(check0, check_all) - cat('These are all numeric cols') - } else{ - cat('First converting to numeric') - check0 = as.numeric(meta_with_afor[,i]) - check_all = c(check0, check_all) - } -} - -# count na now that the respective cols are numeric -na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count -str(na_count) - -# extract how many NAs: -# should be all TRUE -# should be a single number since -# all the cols should have 'equal' and 'same' no. of NAs -# compare if the No of 'NA' are the same for all these cols -na_len = NULL -for (i in na_var){ - temp = na_count[[i]] - na_len = c(na_len, temp) -} - -cat('Checking how many NAs and if these are identical for the selected cols:') -my_nrows = NULL -for ( i in 1: (length(na_len)-1) ){ -# cat(compare(na_len[i]), na_len[i+1]) - c = compare(na_len[i], na_len[i+1]) - if ( c$result ) { - cat('PASS: No. of NAa in selected cols are identical') - my_nrows = na_len[i] } - else { - cat('FAIL: No. of NAa in selected cols mismatch') - } -} - -cat('No. 
of NAs in each of the selected cols: ', my_nrows) - -# yet more sanity checks: -cat('Check whether the ', my_nrows, 'indices are indeed the same') - -#which(is.na(meta_with_afor$OR)) - -# initialise an empty df with nrows as extracted above -na_count_df = data.frame(matrix(vector(mode = 'numeric' -# , length = length(na_var) - ) - , nrow = my_nrows -# , ncol = length(na_var) - )) - -# populate the df with the indices of the cols that are NA -for (i in na_var){ - cat(i) - na_i = which(is.na(meta_with_afor[i])) - na_count_df = cbind(na_count_df, na_i) - colnames(na_count_df)[which(na_var == i)] <- i -} - -# Now compare these indices to ensure these are the same -check2 = NULL -for ( i in 1: ( length(na_count_df)-1 ) ) { -# cat(na_count_df[i] == na_count_df[i+1]) - check_all = identical(na_count_df[[i]], na_count_df[[i+1]]) - check2 = c(check_all, check2) - if ( all(check2) ) { - cat('PASS: The indices for AF, OR, etc are all the same\n') - } else { - cat ('FAIL: Please check indices which are NA') - } -} - - +#df1 = gene_metadata +#df2 = comb_AF_and_OR +# COMMENT: the combining with the other OR and AF values will be done in Python instead \ No newline at end of file