Merge branch 'master' of github.com:tgttunstall/LSHTM_analysis

This commit is contained in:
Tanushree Tunstall 2020-09-10 16:14:46 +01:00
commit 5102bbea1b
21 changed files with 2132 additions and 243 deletions

View file

@ -6,6 +6,7 @@
# 2) <gene>_meta_data.csv
# Output:
# 1) muts with opposite effects on stability
# 2) large combined df including NAs for AF, OR, etc.
# Dim: same no. of rows as gene associated meta_data_with_AFandOR
# 3) small combined df including NAs for AF, OR, etc.
@ -36,17 +37,23 @@ source("plotting_data.R")
# dup_muts
cat("Directories imported:"
, "\n===================="
, "\ndatadir:", datadir
, "\nindir:", indir
, "\noutdir:", outdir
, "\nplotdir:", plotdir)
cat("Variables imported:"
, "\n====================="
, "\ndrug:", drug
, "\ngene:", gene
, "\ngene_match:", gene_match
, "\nLength of upos:", length(upos)
, "\nAngstrom symbol:", angstroms_symbol)
, "\nAngstrom symbol:", angstroms_symbol
, "\nNo. of duplicated muts:", dup_muts_nu
, "\ndr_muts_col:", dr_muts_col
, "\nother_muts_col:", other_muts_col
, "\ndrtype_col:", resistance_col)
# clear excess variables
rm(my_df, upos, dup_muts)
@ -57,7 +64,6 @@ rm(my_df, upos, dup_muts)
# infile 1: output of plotting_data.R
# my_df_u
# infile 2: gene associated meta data
#in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv")
in_filename_gene_metadata = paste0(tolower(gene), "_metadata.csv")
@ -94,6 +100,8 @@ gene_metadata <- read.csv(infile_gene_metadata
, header = T)
cat("Dim:", dim(gene_metadata))
table(gene_metadata$mutation_info)
# counting NAs in AF, OR cols
# or_mychisq
@ -124,6 +132,7 @@ if (identical(sum(is.na(my_df_u$or_kin))
, "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
}
str(gene_metadata)
###################################################################
# combining: PS
@ -145,7 +154,7 @@ merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata))
cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)"
, "\nNo. of merging cols:", length(merging_cols)
, "\nMerging columns identified:\n"))
, "\nMerging columns identified:"))
print(merging_cols)
# important checks!
@ -160,7 +169,7 @@ merged_df2 = merge(x = gene_metadata
, by = merging_cols
, all.y = T)
cat("Dim of merged_df2: ", dim(merged_df2), "\n")
cat("Dim of merged_df2: ", dim(merged_df2))
head(merged_df2$position)
# sanity check
@ -170,10 +179,10 @@ if(nrow(gene_metadata) == nrow(merged_df2)){
,"\nExpected no. of rows: ",nrow(gene_metadata)
,"\nGot no. of rows: ", nrow(merged_df2))
} else{
cat("\nFAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
cat("FAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
, "\nExpected no. of rows after merge: ", nrow(gene_metadata)
, "\nGot no. of rows: ", nrow(merged_df2)
, "\nFinding discrepancy\n")
, "\nFinding discrepancy")
merged_muts_u = unique(merged_df2$mutationinformation)
meta_muts_u = unique(gene_metadata$mutationinformation)
# find the index where it differs
@ -227,6 +236,7 @@ if (identical(sum(is.na(merged_df3$or_kin))
, "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
}
#=========================
# Merge3: merged_df2_comp
# same as merge 1 but excluding NAs from ORs, etc.
@ -237,6 +247,7 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"
na_count_df2 = sum(is.na(merged_df2$af))
merged_df2_comp = merged_df2[!is.na(merged_df2$af),]
# sanity check: no +-1 gymnastics
cat("Checking nrows in merged_df2_comp")
if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
@ -246,20 +257,22 @@ if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
, "\nNo. of rows: ", nrow(merged_df2_comp)
, "\nNo. of cols: ", ncol(merged_df2_comp))
}else{
cat("FAIL: No. of rows mismatch"
,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
,"\nGot no. of rows: ", nrow(merged_df2_comp))
cat("FAIL: No. of rows mismatch"
,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
,"\nGot no. of rows: ", nrow(merged_df2_comp))
}
#=========================
# Merge4: merged_df3_comp
# same as merge 2 but excluding NAs from ORs, etc.,
# or (alternatively) by removing duplicate mutation information
#=========================
na_count_df3 = sum(is.na(merged_df3$af))
#merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
cat("\nChecking nrows in merged_df3_comp")
cat("Checking nrows in merged_df3_comp")
if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
cat("\nPASS: No. of rows match"
,"\nDim of merged_df3_comp: "
@ -267,12 +280,11 @@ if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
, "\nNo. of rows: ", nrow(merged_df3_comp)
, "\nNo. of cols: ", ncol(merged_df3_comp))
}else{
cat("\nFAIL: No. of rows mismatch"
cat("FAIL: No. of rows mismatch"
,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
, "\nGot no. of rows: ", nrow(merged_df3_comp))
,"\nGot no. of rows: ", nrow(merged_df3_comp))
}
# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$af),]
bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]
@ -307,7 +319,8 @@ all.equal(foo, bar)
# clear variables
rm(foo, bar, gene_metadata
, in_filename_params, infile_params, merging_cols
, in_filename_gene_metadata, infile_gene_metadata)
, in_filename_gene_metadata, infile_gene_metadata
, merged_df2v2, merged_df2v3)
#*************************
#####################################################################
# Combining: LIG
@ -334,4 +347,4 @@ if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){
#==========================================================================
# end of script
##=========================================================================
##==========================================================================