changed all_df to adult_df to extract relevant info from the adults data

2020-10-29 10:45:49 +00:00 · 2020-10-29 10:45:49 +00:00 · bbdd2d12e5
commit bbdd2d12e5
parent 9e5b202f5d
5 changed files with 78 additions and 1007 deletions
--- a/data_extraction_formatting.R
+++ b/data_extraction_formatting.R
@@ -3,7 +3,7 @@ getwd()
 setwd('~/git/mosaic_2020/')
 getwd()
 ########################################################################
-# TASK: Extract relevant columns from mosaic data
+# TASK: Extract relevant columns from mosaic adults data
 # sam
 # serum
 # npa
@@ -14,17 +14,17 @@ getwd()
 source("read_data.R")

 # clear unnecessary variables
-#rm()
+rm(all_df)
 ########################################################################

 #=========
 # sam
 #=========
 sam_regex = regex(".*_sam[1-3]{1}$", ignore_case = T)
-sam_cols_i = str_extract(colnames(all_df), sam_regex)   # not boolean  
-#sam_cols_b = colnames(all_df)%in%sam_cols_i # boolean
+sam_cols_i = str_extract(colnames(adult_df), sam_regex)   # not boolean  
+#sam_cols_b = colnames(adult_df)%in%sam_cols_i # boolean

-sam_cols = colnames(all_df)[colnames(all_df)%in%sam_cols_i]
+sam_cols = colnames(adult_df)[colnames(adult_df)%in%sam_cols_i]

 # this contains log columns +  daysamp_samXX: omitting these
 sam_regex_log_days = regex("log|day.*_sam[1-3]{1}$", ignore_case = T, perl = T)
@@ -48,7 +48,7 @@ cat("Extracting SAM cols + metadata_cols")

 if ( length(sam_cols_to_extract) == length(meta_data_cols) + length(sam_cols_clean) ){
  cat("Extracing", length(sam_cols_to_extract),  "columns for sam")
-  sam_df = all_df[, sam_cols_to_extract]
+  sam_df = adult_df[, sam_cols_to_extract]
 }else{
  cat("FAIL: length mismatch"
      , "Expeceted to extract:", length(meta_data_cols) + length(sam_cols_clean), "columns"
@@ -61,10 +61,10 @@ colnames_sam_df = colnames(sam_df); colnames_sam_df
 # serum
 #=========
 serum_regex = regex(".*_serum[1-3]{1}$", ignore_case = T)
-serum_cols_i = str_extract(colnames(all_df), serum_regex)   # not boolean  
-#serum_cols_b = colnames(all_df)%in%serum_cols_i # boolean
+serum_cols_i = str_extract(colnames(adult_df), serum_regex)   # not boolean  
+#serum_cols_b = colnames(adult_df)%in%serum_cols_i # boolean

-serum_cols = colnames(all_df)[colnames(all_df)%in%serum_cols_i]
+serum_cols = colnames(adult_df)[colnames(adult_df)%in%serum_cols_i]

 # this contains log columns +  dayserump_serumXX: omitting these
 serum_regex_log_days = regex("log|day.*_serum[1-3]{1}$", ignore_case = T, perl = T)
@@ -88,7 +88,7 @@ cat("Extracting SERUM cols + metadata_cols")

 if ( length(serum_cols_to_extract) == length(meta_data_cols) + length(serum_cols_clean) ){
  cat("Extracing", length(serum_cols_to_extract),  "columns for serum")
-  serum_df = all_df[, serum_cols_to_extract]
+  serum_df = adult_df[, serum_cols_to_extract]
 }else{
  cat("FAIL: length mismatch"
      , "Expeceted to extract:", length(meta_data_cols) + length(serum_cols_clean), "columns"
@@ -101,10 +101,10 @@ colnames_serum_df = colnames(serum_df); colnames_serum_df
 # npa
 #=========
 npa_regex = regex(".*_npa[1-3]{1}$", ignore_case = T)
-npa_cols_i = str_extract(colnames(all_df), npa_regex)   # not boolean  
-#npa_cols_b = colnames(all_df)%in%npa_cols_i # boolean
+npa_cols_i = str_extract(colnames(adult_df), npa_regex)   # not boolean  
+#npa_cols_b = colnames(adult_df)%in%npa_cols_i # boolean

-npa_cols = colnames(all_df)[colnames(all_df)%in%npa_cols_i]
+npa_cols = colnames(adult_df)[colnames(adult_df)%in%npa_cols_i]

 # this contains log columns +  daynpap_npaXX: omitting these
 npa_regex_log_days = regex("log|day|vl_samptime|ct.*_npa[1-3]{1}$", ignore_case = T, perl = T)
@@ -128,7 +128,7 @@ cat("Extracting NPA cols + metadata_cols")

 if ( length(npa_cols_to_extract) == length(meta_data_cols) + length(npa_cols_clean) ){
  cat("Extracing", length(npa_cols_to_extract),  "columns for npa")
-  npa_df = all_df[, npa_cols_to_extract]
+  npa_df = adult_df[, npa_cols_to_extract]
 }else{
  cat("FAIL: length mismatch"
      , "Expeceted to extract:", length(meta_data_cols) + length(npa_cols_clean), "columns"
@@ -137,8 +137,11 @@ if ( length(npa_cols_to_extract) == length(meta_data_cols) + length(npa_cols_cle

 colnames_npa_df = colnames(npa_df); colnames_npa_df

+#==============
+# quick checks
+#==============
 colnames_check = as.data.frame(cbind(colnames_sam_df, colnames_serum_df, colnames_npa_df))
-tail(colnames_check)
+tail(colnames_check) # gives a warning message due to differing no. of rows for cbind!

 # put NA where a match doesn't exist
 # unmatched lengths
@@ -168,12 +171,16 @@ quick_check = as.data.frame(cbind(metadata_all$mosaic
                                  , metadata_all$adult
                                  , metadata_all$age
                                  , metadata_all$obesity
-                                  , metadata_all$obese2))
+                                  , metadata_all$obese2
+                                  ))
 colnames(quick_check) = c("mosaic", "adult", "age", "obesity", "obese2")

 ##########################################################################
 # LF data
 ##########################################################################
+cols_to_omit = c("adult", "obese2"
+                 , "height", "height_unit", "weight"
+                 ,  "weight_unit", "visual_est_bmi", "bmi_rating")

 #==============
 # lf data: sam
@@ -181,19 +188,11 @@ colnames(quick_check) = c("mosaic", "adult", "age", "obesity", "obese2")
 str(sam_df)
 table(sam_df$obesity); table(sam_df$obese2)

-sam_df_adults = sam_df[sam_df$adult == 1,]
-
-cols_to_omit = c("type"
-                 #, "flustat"
-                 #, "obesity"
-                 #, "obese2"
-                 , "height", "height_unit", "weight"
-                 ,  "weight_unit", "visual_est_bmi", "bmi_rating")
-
-#sam_df_adults_clean = sam_df_adults[!cols_to_omit]
+#sam_df_adults = sam_df[sam_df$adult == 1,]  # resolved at source and only dealing with age as adult
+sam_df_adults = sam_df

 wf_cols =  colnames(sam_df_adults)[!colnames(sam_df_adults)%in%cols_to_omit]
-sam_df_adults_clean = sam_df_adults[wf_cols]
+sam_wf = sam_df_adults[wf_cols]

 pivot_cols = meta_data_cols
 # subselect pivot_cols
@@ -208,25 +207,25 @@ if (length(pivot_cols) == length(meta_data_cols) - length(cols_to_omit)){
  quit()
 }

-expected_rows_sam_lf = nrow(sam_df_adults_clean) * (length(sam_df_adults_clean) - length(pivot_cols)); expected_rows_sam_lf
+expected_rows_sam_lf = nrow(sam_wf) * (length(sam_wf) - length(pivot_cols)); expected_rows_sam_lf

 # using regex: 
-sam_adults_lf = sam_df_adults_clean %>% 
+sam_lf = sam_wf %>% 
  tidyr::pivot_longer(-all_of(pivot_cols)
                      , names_to = c("mediator", "sample_type", "timepoint")
                      , names_pattern = "(.*)_(.*)([1-3]{1})" 
                      , values_to = "value")

 if (
-  (nrow(sam_adults_lf) == expected_rows_sam_lf) & (sum(table(is.na(sam_adults_lf$mediator))) == expected_rows_sam_lf)
+  (nrow(sam_lf) == expected_rows_sam_lf) & (sum(table(is.na(sam_lf$mediator))) == expected_rows_sam_lf)
   ) {
  cat(paste0("PASS: long format data has correct no. of rows and NA in mediator:"
-             , "\nNo. of rows: ", nrow(sam_adults_lf)
-             , "\nNo. of cols: ", ncol(sam_adults_lf)))
+             , "\nNo. of rows: ", nrow(sam_lf)
+             , "\nNo. of cols: ", ncol(sam_lf)))
 } else{
  cat(paste0("FAIL:long format data has unexpected no. of rows or NAs in mediator"
             , "\nExpected no. of rows: ", expected_rows_sam_lf
-             , "\nGot: ", nrow(sam_adults_lf)
+             , "\nGot: ", nrow(sam_lf)
             , "\ncheck expected rows calculation!"))
  quit()
 }
@@ -241,11 +240,11 @@ if (
 str(serum_df)
 table(serum_df$obesity); table(serum_df$obese2)

-serum_df_adults = serum_df[serum_df$adult == 1,]
+#serum_df_adults = serum_df[serum_df$adult == 1,] # extract based on age
+serum_df_adults = serum_df

-#serum_df_adults_clean = serum_df_adults[!cols_to_omit]
 wf_cols =  colnames(serum_df_adults)[!colnames(serum_df_adults)%in%cols_to_omit]
-serum_df_adults_clean = serum_df_adults[wf_cols]
+serum_wf = serum_df_adults[wf_cols]

 pivot_cols = meta_data_cols
 pivot_cols = meta_data_cols[!meta_data_cols%in%cols_to_omit];pivot_cols 
@@ -259,25 +258,25 @@ if (length(pivot_cols) == length(meta_data_cols) - length(cols_to_omit)){
  quit()
 }

-expected_rows_serum_lf = nrow(serum_df_adults_clean) * (length(serum_df_adults_clean) - length(pivot_cols)); expected_rows_serum_lf
+expected_rows_serum_lf = nrow(serum_wf) * (length(serum_wf) - length(pivot_cols)); expected_rows_serum_lf

 # using regex: 
-serum_adults_lf = serum_df_adults_clean %>% 
+serum_lf = serum_wf %>% 
  tidyr::pivot_longer(-all_of(pivot_cols)
                      , names_to = c("mediator", "sample_type", "timepoint")
                      , names_pattern = "(.*)_(.*)([1-3]{1})" 
                      , values_to = "value")

 if (
-  (nrow(serum_adults_lf) == expected_rows_serum_lf) & (sum(table(is.na(serum_adults_lf$mediator))) == expected_rows_serum_lf)
+  (nrow(serum_lf) == expected_rows_serum_lf) & (sum(table(is.na(serum_lf$mediator))) == expected_rows_serum_lf)
 ) {
  cat(paste0("PASS: long format data has correct no. of rows and NA in mediator:"
-             , "\nNo. of rows: ", nrow(serum_adults_lf)
-             , "\nNo. of cols: ", ncol(serum_adults_lf)))
+             , "\nNo. of rows: ", nrow(serum_lf)
+             , "\nNo. of cols: ", ncol(serum_lf)))
 } else{
  cat(paste0("FAIL:long format data has unexpected no. of rows or NAs in mediator"
             , "\nExpected no. of rows: ", expected_rows_serum_lf
-             , "\nGot: ", nrow(serum_adults_lf)
+             , "\nGot: ", nrow(serum_lf)
             , "\ncheck expected rows calculation!"))
  quit()
 }
@@ -288,11 +287,11 @@ if (
 str(npa_df)
 table(npa_df$obesity); table(npa_df$obese2)

-npa_df_adults = npa_df[npa_df$adult == 1,]
-#npa_df_adults_clean = npa_df_adults[!cols_to_omit]
+#npa_df_adults = npa_df[npa_df$adult == 1,] # extract based on age
+npa_df_adults = npa_df

 wf_cols =  colnames(npa_df_adults)[!colnames(npa_df_adults)%in%cols_to_omit]
-npa_df_adults_clean = npa_df_adults[wf_cols]
+npa_wf = npa_df_adults[wf_cols]

 pivot_cols = meta_data_cols
 pivot_cols = meta_data_cols[!meta_data_cols%in%cols_to_omit];pivot_cols 
@@ -306,25 +305,25 @@ if (length(pivot_cols) == length(meta_data_cols) - length(cols_to_omit)){
  quit()
 }

-expected_rows_npa_lf = nrow(npa_df_adults_clean) * (length(npa_df_adults_clean) - length(pivot_cols)); expected_rows_npa_lf
+expected_rows_npa_lf = nrow(npa_wf) * (length(npa_wf) - length(pivot_cols)); expected_rows_npa_lf

 # using regex: 
-npa_adults_lf = npa_df_adults_clean %>% 
+npa_lf = npa_wf %>% 
  tidyr::pivot_longer(-all_of(pivot_cols)
                      , names_to = c("mediator", "sample_type", "timepoint")
                      , names_pattern = "(.*)_(.*)([1-3]{1})" 
                      , values_to = "value")

 if (
-  (nrow(npa_adults_lf) == expected_rows_npa_lf) & (sum(table(is.na(npa_adults_lf$mediator))) == expected_rows_npa_lf)
+  (nrow(npa_lf) == expected_rows_npa_lf) & (sum(table(is.na(npa_lf$mediator))) == expected_rows_npa_lf)
 ) {
  cat(paste0("PASS: long format data has correct no. of rows and NA in mediator:"
-             , "\nNo. of rows: ", nrow(npa_adults_lf)
-             , "\nNo. of cols: ", ncol(npa_adults_lf)))
+             , "\nNo. of rows: ", nrow(npa_lf)
+             , "\nNo. of cols: ", ncol(npa_lf)))
 } else{
  cat(paste0("FAIL:long format data has unexpected no. of rows or NAs in mediator"
             , "\nExpected no. of rows: ", expected_rows_npa_lf
-             , "\nGot: ", nrow(npa_adults_lf)
+             , "\nGot: ", nrow(npa_lf)
             , "\ncheck expected rows calculation!"))
  quit()
 }
@@ -334,12 +333,14 @@ if (
 rm(sam_regex, sam_regex_log_days, sam_cols, sam_cols_clean, sam_cols_i, sam_cols_to_extract, sam_cols_to_omit)
 rm(serum_regex, serum_regex_log_days, serum_cols, serum_cols_clean, serum_cols_i, serum_cols_to_extract, serum_cols_to_omit)
 rm(npa_regex, npa_regex_log_days, npa_cols, npa_cols_clean, npa_cols_i, npa_cols_to_extract, npa_cols_to_omit)
-rm(all_df)
+rm(adult_df)
 rm(colnames_check)
-rm(i, j, expected_cols, start, wf_cols, extra_cols, cols_to_omit)
+rm(i, j
+   #, expected_cols
+   , start, wf_cols, extra_cols, cols_to_omit)

 # rm not_clean dfs
 rm(sam_df_adults, serum_df_adults, npa_df_adults)

-# rm df containing non-adults
+# rm df
 rm(sam_df, serum_df, npa_df)