import commit

2020-01-08 16:15:33 +00:00 · 2020-01-08 16:15:33 +00:00 · bccfe68192
commit bccfe68192
39 changed files with 6837 additions and 0 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
+++ b/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
@ -0,0 +1,257 @@
+getwd()  # show the working directory before it is changed
+setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")  # the relative source() calls below resolve from here
+getwd()  # show the working directory after the change
+
+#########################################################
+# 1: Installing and loading required packages           #
+#########################################################
+
+source("Header_TT.R")  # shared header script; per the banner above it installs/loads the packages
+#source("barplot_colour_function.R")
+
+##########################################################
+#           Checking: Entire data frame and for PS      #
+##########################################################
+
+###########################
+#2) Read file: combined one from the script
+###########################
+# Build the merged data frames. Going by the notes kept in this script,
+# the sourced file leaves four objects in the workspace:
+#   merged_df2, merged_df3           : may contain NA
+#   merged_df2_comp, merged_df3_comp : without NA
+source("combining_two_df.R")
+
+# The checks below are run against merged_df3 only, so keep it under a
+# working name ...
+my_df <- merged_df3
+
+# ... and drop the sourced originals so that only my_df stays in play.
+rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))
+
+# Sanity check, expected TRUE: the two position columns agree.
+identical(my_df$Position, my_df$position)
+
+#################################
+# Read file: normalised file
+# output of step 4 mcsm_pipeline
+#################################
+
+
+# Path of the normalised mCSM csv file that is checked against my_df.
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
+
+# The first csv column becomes the row names of the data frame.
+# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
+# reassigned, whereas TRUE and FALSE are reserved words.
+mcsm_data <- read.csv(inFile
+                  , row.names = 1
+                  , stringsAsFactors = FALSE
+                  , header = TRUE)
+str(mcsm_data)
+
+# column names of the mcsm data, used to subset the merged data frame
+my_colnames  = colnames(mcsm_data)
+
+#====================================
+# subset my_df to include only the columns in mcsm data
+my_df2 = my_df[my_colnames]
+#====================================
+# compare the two: eyeball the first few values of each
+head(mcsm_data$Mutationinformation)
+head(mcsm_data$Position)
+
+head(my_df2$Mutationinformation)
+head(my_df2$Position)
+
+# sort mcsm data by Mutationinformation
+mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),]
+head(mcsm_data_s$Mutationinformation)
+head(mcsm_data_s$Position)
+
+# Strict comparison: has been coming back FALSE. identical() also
+# compares attributes, and the row names appear to be what differs.
+identical(mcsm_data_s, my_df2)
+
+# from library dplyr
+setdiff(mcsm_data_s, my_df2)
+
+#from lib compare
+compare(mcsm_data_s, my_df2) # seems rownames are the problem
+
+# Automated content check (replaces the manual write.csv + meld step):
+# reset the row names on copies of both data frames and compare those.
+# all.equal() returns TRUE when the contents agree, otherwise a character
+# description of the differences. The originals are left untouched.
+mcsm_data_s_chk = mcsm_data_s
+my_df2_chk = my_df2
+rownames(mcsm_data_s_chk) = NULL
+rownames(my_df2_chk) = NULL
+all.equal(mcsm_data_s_chk, my_df2_chk)
+
+# manual fallback: write both files out and diff them (e.g. with meld)
+#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
+#write.csv(my_df2, "my_df2.csv", row.names = F)
+
+
+#====================================================== end of section 1
+
+
+
+##########################################################
+#             Checking: LIG(Filtered dataframe)          #
+##########################################################
+
+# clear workspace
+rm(list = ls())
+
+# Rebuild the merged data frames, this time from the ligand version of
+# the combining script. Going by the notes kept in this script, the
+# sourced file leaves four objects in the workspace:
+#   merged_df2, merged_df3           : may contain NA
+#   merged_df2_comp, merged_df3_comp : without NA
+source("combining_two_df_lig.R")
+
+# The checks below are run against merged_df3 only, so keep it under a
+# working name ...
+my_df <- merged_df3
+
+# ... and drop the sourced originals so that only my_df stays in play.
+rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))
+
+# Sanity check, expected TRUE: the two position columns agree.
+identical(my_df$Position, my_df$position)
+
+#################################
+# Read file: normalised file
+# output of step 4 mcsm_pipeline
+#################################
+
+# Path of the normalised mCSM csv file used for the ligand checks.
+inDir = "~/git/Data/pyrazinamide/input/processed/"
+inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
+
+# The first csv column becomes the row names of the data frame.
+# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
+# reassigned, whereas TRUE and FALSE are reserved words.
+mcsm_data <- read.csv(inFile
+                      , row.names = 1
+                      , stringsAsFactors = FALSE
+                      , header = TRUE)
+str(mcsm_data)
+
+###########################
+# 4a: Filter/subset data: ONLY for LIGand analysis
+# Lig plots < 10Ang
+# Filter the lig plots for Dis_to_lig < 10Ang
+###########################
+# sanity checks
+upos <- unique(mcsm_data$Position)
+
+# distance range before filtering
+max(mcsm_data$Dis_lig_Ang)
+min(mcsm_data$Dis_lig_Ang)
+
+# Ligand filter: keep the rows whose distance is below 10 Ang
+# (rows with a missing distance are not kept).
+mcsm_data2 <- mcsm_data[which(mcsm_data$Dis_lig_Ang < 10), , drop = FALSE]
+
+# the unfiltered data is dropped to avoid confusion further down
+rm(list = "mcsm_data")
+
+# every remaining row should count as below the cut-off, none as above
+table(mcsm_data2$Dis_lig_Ang < 10)
+table(mcsm_data2$Dis_lig_Ang > 10)
+
+# distance range after filtering
+max(mcsm_data2$Dis_lig_Ang)
+min(mcsm_data2$Dis_lig_Ang)
+
+# unique positions left after filtering
+upos_f <- unique(mcsm_data2$Position)
+upos_f
+
+# colnames of df that you will need to subset the bigger df from
+my_colnames  = colnames(mcsm_data2)
+#====================================
+# subset bigger df i.e my_df to include only the columns in mcsm data2
+my_df2 = my_df[my_colnames]
+
+rm(my_df) #to avoid confusion
+#====================================
+# compare the two: eyeball the first few values of each
+head(mcsm_data2$Mutationinformation)
+head(mcsm_data2$Position)
+
+head(my_df2$Mutationinformation)
+head(my_df2$Position)
+
+# sort mcsm data by Mutationinformation
+mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),]
+head(mcsm_data2_s$Mutationinformation)
+head(mcsm_data2_s$Position)
+
+# Strict comparison: has been coming back FALSE. identical() also
+# compares attributes, and the row names appear to be what differs.
+identical(mcsm_data2_s, my_df2)
+
+# from library dplyr
+setdiff(mcsm_data2_s, my_df2)
+
+# from library compare
+compare(mcsm_data2_s, my_df2) # seems rownames are the problem
+
+# Automated content check (replaces the manual write.csv + meld step):
+# reset the row names on copies of both data frames and compare those.
+# all.equal() returns TRUE when the contents agree, otherwise a character
+# description of the differences. The originals are left untouched.
+mcsm_data2_s_chk = mcsm_data2_s
+my_df2_chk = my_df2
+rownames(mcsm_data2_s_chk) = NULL
+rownames(my_df2_chk) = NULL
+all.equal(mcsm_data2_s_chk, my_df2_chk)
+
+# manual fallback: write both files out and diff them (e.g. with meld)
+#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
+#write.csv(my_df2, "my_df2.csv", row.names = F)
+
+
+##########################################################
+#  extract and write output file for SNP posn: all     #
+##########################################################
+
+# merged_df3 and merged_df3_comp are needed again here, but the clean-up
+# steps earlier in this script (rm(list = ls()) and the rm() of the
+# merged_df* objects) remove them, so a top-to-bottom run stopped with
+# "object 'merged_df3' not found". Re-create them when they are missing;
+# objects already present in the workspace are used as they are.
+# NOTE(review): assumes the SNP position lists should come from the full
+# combined data -- confirm; source "combining_two_df_lig.R" instead if the
+# ligand version is the one intended.
+if (!exists("merged_df3") || !exists("merged_df3_comp")) {
+  source("combining_two_df.R")
+}
+
+head(merged_df3$Position)
+
+# order by position, then reduce to the unique positions
+foo = merged_df3[order(merged_df3$Position),]
+head(foo$Position)
+
+snp_pos_unique = unique(foo$Position); snp_pos_unique
+
+# sanity check: element-wise comparison with the positions in combined_df.
+# NOTE(review): combined_df is not created anywhere in this script;
+# presumably one of the sourced scripts provides it -- confirm. The check
+# is skipped when the object is not available.
+if (exists("combined_df")) {
+  table(snp_pos_unique == combined_df$Position)
+}
+
+#=====================
+# write_output files
+#=====================
+# NOTE(review): the input csv in this script is read from
+# "~/git/Data/...", whereas output goes to "~/Data/..." -- confirm that
+# this path is the intended one.
+outDir = "~/Data/pyrazinamide/input/processed/"
+
+
+outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1
+print(paste0("Output file name and path will be:", outFile1))
+
+# one position per line, without row or column names
+write.table(snp_pos_unique
+            , outFile1
+            , row.names = FALSE
+            , col.names = FALSE)
+
+##############################################################
+#  extract and write output file for SNP posn: complete only #
+##############################################################
+head(merged_df3_comp$Position)
+
+# same steps as above, on the data frame without NA
+foo = merged_df3_comp[order(merged_df3_comp$Position),]
+head(foo$Position)
+
+snp_pos_unique = unique(foo$Position); snp_pos_unique
+
+# outDir = "~/Data/pyrazinamide/input/processed/" # already set
+
+outFile2 = paste0(outDir, "snp_pos_unique_comp.txt")
+print(paste0("Output file name and path will be:", outFile2))
+
+write.table(snp_pos_unique
+            , outFile2
+            , row.names = FALSE
+            , col.names = FALSE)
+#============================== end of script
+
+