reran to generate merged_df3 with correct dst for dst muts. modified combining_dfs_plotting.R

This commit is contained in:
Tanushree Tunstall 2022-07-08 21:33:57 +01:00
parent 289c8913d0
commit 8079dd7b6c
6 changed files with 148 additions and 211 deletions

View file

@ -204,6 +204,9 @@ combining_dfs_plotting <- function( my_df_u
merged_df2$lineage_labels = merged_df2$lineage
#merged_df2$lineage_labels = as.factor(merged_df2$lineage_labels)
#merged_df2$lineage_labels = factor(merged_df2$lineage_labels)
table(merged_df2$mutation_info_labels_orig) # original
table(merged_df2$mutation_info_labels_v1) # intermediate
table(merged_df2$mutation_info_labels) # revised, corresponding to dst_mode
#=================================================================
# Merge 2: merged_df3
@ -214,17 +217,123 @@ combining_dfs_plotting <- function( my_df_u
# but this should be good for the numerical corr plots
#==================================================================
# remove duplicated mutations
cat("\nMerging dfs without NAs: small df (removing muts with no AF|OR associated)"
,"\nCannot trust lineage info from this"
,"\nlinking col: mutationinforamtion"
,"\nfilename: merged_df3")
# cat("\nMerging dfs without NAs: small df (removing muts with no AF|OR associated)"
# ,"\nCannot trust lineage info from this"
# ,"\nlinking col: mutationinforamtion"
# ,"\nfilename: merged_df3")
#
# merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),]
#
#
# head(merged_df3$position); tail(merged_df3$position) # should be sorted
#
# # sanity check
# cat("\nChecking nrows in merged_df3")
#
# if( nrow(my_df_u) == nrow(merged_df3) ){
# cat("\nPASS: No. of rows match with my_df"
# ,"\nExpected no. of rows: ", nrow(my_df_u)
# ,"\nGot no. of rows: ", nrow(merged_df3))
# } else {
# cat("\nFAIL: No. of rows mismatch"
# , "\nNo. of rows my_df: ", nrow(my_df_u)
# , "\nNo. of rows merged_df3: ", nrow(merged_df3))
# quit()
# }
#
# counting NAs in AF, OR cols in merged_df3
# this is because mcsm has no AF, OR cols,
# so you cannot count NAs
# if (identical(sum(is.na(merged_df3$or_kin))
# , sum(is.na(merged_df3$pwald_kin))
# , sum(is.na(merged_df3$af_kin)))){
# cat("\nPASS: NA count match for OR, pvalue and AF\n")
# na_count_df3 = sum(is.na(merged_df3$af_kin))
# cat("\nNo. of NAs: ", sum(is.na(merged_df3$or_kin)))
# } else{
# cat("\nFAIL: NA count mismatch"
# , "\nNA in OR: ", sum(is.na(merged_df3$or_kin))
# , "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin))
# , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
# }
#
# ===================================
# Revised way to generate merged_df3
# ===================================
#%% Getting merged_df3: VERY important and careful subsetting merging
# dst mode column as carefully curated dst based on knowledge based approach.
# so now we want to get the
na_muts = merged_df2[is.na(merged_df2$dst),]
no_na_muts = merged_df2[!is.na(merged_df2$dst),]
muts_na_U = na_muts[!duplicated(na_muts[c('mutationinformation')]), ]
muts_no_na_U = no_na_muts[!duplicated(no_na_muts[c('mutationinformation')]), ]
# get muts from no_na that are NOT present in muts with na from dplyr
dst_muts = dplyr::anti_join(muts_no_na_U, muts_na_U, by = 'mutationinformation')
#dst_muts = anti_join(muts_no_na_U, muts_na_U, by = 'mutationinformation')
# ALL good muts are NOT in na muts unique i.e dst muts should NOT exist in na_muts
if (all(dst_muts$mutationinformation%in%muts_na_U$mutationinformation) == FALSE){
cat("\nPASS: checked length for dst tested muts"
, "\nNo. of dst testetd muts:", nrow(dst_muts))
}else{
stop("Dst muts are not correctly identified")
}
if ( class(dst_muts) != "data.frame" ){
dst_muts = as.data.frame(dst_muts)
} else{
cat("\ndst_muts is a df")
}
# ALL bad muts are in na muts unique
bad_muts = dplyr::semi_join(muts_no_na_U, muts_na_U, by = "mutationinformation")
#bad_muts = semi_join(muts_no_na_U, muts_na_U, by = "mutationinformation")
if (all(bad_muts$mutationinformation%in%muts_na_U$mutationinformation) == TRUE){
cat("\nPASS: checked length of NOT-dst tested muts"
, "\nNo. of NOT dst-tested_muts:", nrow(bad_muts))
}else{
stop("Non-dst muts are not correctly identified")
}
if ( class(bad_muts) != "data.frame" ){
bad_muts = as.data.frame(bad_muts)
} else{
cat("\nbad_muts is a df")
}
cat("\nNo. of muts with dst:", nrow(dst_muts)
, "\nNo. of muts without dst:", nrow(muts_na_U) - nrow(dst_muts) )
# now merge
if ( all(colnames(muts_na_U) == colnames(dst_muts)) ){
cat("\nPASS: rowbind to get merged_df3")
merged_df3 = dplyr::bind_rows(muts_na_U, dst_muts)
#merged_df3 = bind_rows(muts_na_U, dst_muts)
} else{
stop("Quitting: merged_df3 could not be generated")
}
if ( nrow(merged_df3) == length(unique(merged_df2$mutationinformation)) ){
cat("\nPASS: merged_df3 sucessfully generated..."
, "\nnrow merged_df3:", nrow(merged_df3)
, "\nncol merged_df3:", ncol(merged_df3))
}else{
stop("Cannot generate merged_df3")
}
##################################################################
merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),]
head(merged_df3$position); tail(merged_df3$position) # should be sorted
# sanity check
cat("\nChecking nrows in merged_df3")
if(nrow(my_df_u) == nrow(merged_df3)){
if( nrow(my_df_u) == nrow(merged_df3) ){
cat("\nPASS: No. of rows match with my_df"
,"\nExpected no. of rows: ", nrow(my_df_u)
,"\nGot no. of rows: ", nrow(merged_df3))
@ -234,166 +343,9 @@ combining_dfs_plotting <- function( my_df_u
, "\nNo. of rows merged_df3: ", nrow(merged_df3))
quit()
}
# counting NAs in AF, OR cols in merged_df3
# this is because mcsm has no AF, OR cols,
# so you cannot count NAs
if (identical(sum(is.na(merged_df3$or_kin))
, sum(is.na(merged_df3$pwald_kin))
, sum(is.na(merged_df3$af_kin)))){
cat("\nPASS: NA count match for OR, pvalue and AF\n")
na_count_df3 = sum(is.na(merged_df3$af_kin))
cat("\nNo. of NAs: ", sum(is.na(merged_df3$or_kin)))
} else{
cat("\nFAIL: NA count mismatch"
, "\nNA in OR: ", sum(is.na(merged_df3$or_kin))
, "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin))
, "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
}
#===================================================
# Merge3: merged_df2_comp
# same as merge 1 but excluding NAs from ORs, etc.
#====================================================
cat("\nMerging dfs without any NAs: big df (1-many relationship b/w id & mut)"
,"\nfilename: merged_df2_comp")
na_count_df2 = sum(is.na(merged_df2$af))
merged_df2_comp = merged_df2[!is.na(merged_df2$af),]
# sanity check: no +-1 gymnastics
cat("\nChecking nrows in merged_df2_comp")
if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
cat("\nPASS: No. of rows match"
,"\nDim of merged_df2_comp: "
,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
, "\nNo. of rows: ", nrow(merged_df2_comp)
, "\nNo. of cols: ", ncol(merged_df2_comp))
}else{
cat("\nFAIL: No. of rows mismatch"
,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
,"\nGot no. of rows: ", nrow(merged_df2_comp))
}
#======================================================
# Merge4: merged_df3_comp
# same as merge 2 but excluding NAs from ORs, etc or
# remove duplicate mutation information
#=======================================================
na_count_df3 = sum(is.na(merged_df3$af))
#merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
cat("\nChecking nrows in merged_df3_comp")
if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
cat("\nPASS: No. of rows match"
,"\nDim of merged_df3_comp: "
,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
, "\nNo. of rows: ", nrow(merged_df3_comp)
, "\nNo. of cols: ", ncol(merged_df3_comp))
}else{
cat("\nFAIL: No. of rows mismatch"
,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
,"\nGot no. of rows: ", nrow(merged_df3_comp))
}
# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$af),]
bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]
# compare dfs: foo and merged_df3_com
all.equal(foo, bar)
#summary(comparedf(foo, bar))
cat("\n------------------------"
, "\nSummary of created dfs:"
, "\n------------------------"
, "\n1) Dim of merged_df2: " , nrow(merged_df2), "," , ncol(merged_df2)
, "\n2) Dim of merged_df2_comp: " , nrow(merged_df2_comp), "," , ncol(merged_df2_comp)
, "\n3) Dim of merged_df3: " , nrow(merged_df3), "," , ncol(merged_df3)
, "\n4) Dim of merged_df3_comp: " , nrow(merged_df3_comp), "," , ncol(merged_df3_comp))
#####################################################################
# Combining: LIG
#####################################################################
#============
# Merges 5-8
#============
cat("\n=========================================="
, "\nStarting filtering for mcsm ligand df"
, "\n===========================================")
if (lig_dist_colname%in%names(my_df_u)){
cat("\nFiltering column: ", lig_dist_colname
, "\nCut off criteria: ", lig_dist_cutoff, "Angstroms")
df_lig = my_df_u[my_df_u[[lig_dist_colname]] < lig_dist_cutoff,]
#merged_df2_lig = merged_df2[merged_df2$ligand_distance<lig_dist_cutoff,]
merged_df2_lig = merged_df2[merged_df2[[lig_dist_colname]] < lig_dist_cutoff,]
dim(merged_df2_lig)
merged_df2_comp_lig = merged_df2_comp[merged_df2_comp[[lig_dist_colname]] < lig_dist_cutoff,]
merged_df3_lig = merged_df3[merged_df3[[lig_dist_colname]] < lig_dist_cutoff,]
merged_df3_comp_lig = merged_df3_comp[merged_df3_comp[[lig_dist_colname]] < lig_dist_cutoff,]
cat("\n------------------------"
, "\nSummary of created ligand dfs:"
, "\n------------------------"
, "\n1) Dim of merged_df2_lig: " , nrow(merged_df2_lig), "," , ncol(merged_df2_lig)
, "\n2) Dim of merged_df2_comp_lig: " , nrow(merged_df2_comp_lig), "," , ncol(merged_df2_comp_lig)
, "\n3) Dim of merged_df3_lig: " , nrow(merged_df3_lig), "," , ncol(merged_df3_lig)
, "\n4) Dim of merged_df3_comp_lig: " , nrow(merged_df3_comp_lig), "," , ncol(merged_df3_comp_lig))
} else {
cat("\nFiltering column: ", lig_dist_colname, " not found\n")
}
#quit()
# sanity check
if (nrow(merged_df3_lig) == nrow(df_lig)){
print("\nPASS: verified merged_df3_lig")
}else{
cat(paste0("\nFAIL: nrow mismatch for merged_df3_lig"
, "\nExpected:", nrow(df_lig)
, "\nGot:", nrow(merged_df3_lig)))
}
#==============================================================
############################################
# OPTIONAL: write output files in one go
############################################
#outvars = c(#"merged_df2",
#"merged_df2_comp",
#"merged_df2_lig",
#"merged_df2_comp_lig",
#"meregd_df3_comp"
#"merged_df3_comp_lig",
#"merged_df3",
#"merged_df3_lig")
#cat("Writing output files: "
#, "\nPath:", outdir)
#for (i in outvars){
#out_filename = paste0(i, ".csv")
#outfile = paste0(outdir, "/", out_filename)
#cat("Writing output file:"
# ,"\nFilename: ", out_filename,"\n")
#write.csv(get(i), outfile, row.names = FALSE)
#cat("Finished writing: ", outfile
# , "\nNo. of rows: ", nrow(get(i))
# , "\nNo. of cols: ", ncol(get(i)), "\n")
#}
return(list( merged_df2
, merged_df3
, merged_df2_comp
, merged_df3_comp
, merged_df2_lig
, merged_df3_lig
, merged_df2_comp_lig
, merged_df3_comp_lig))
, merged_df3
))
cat("\nEnd of combining_dfs_plotting.R script")
}