getwd() setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts") getwd() ######################################################### # 1: Installing and loading required packages # ######################################################### source("Header_TT.R") #source("barplot_colour_function.R") ########################################################## # Checking: Entire data frame and for PS # ########################################################## ########################### #2) Read file: combined one from the script ########################### source("combining_two_df.R") # df with NA: # merged_df2 # merged_df3: # df without NA: # merged_df2_comp: # merged_df3_comp: ###################### # You need to check it # with the merged_df3 ######################## #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT my_df = merged_df3 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #clear variables rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) # should be true identical(my_df$Position, my_df$position) ################################# # Read file: normalised file # output of step 4 mcsm_pipeline ################################# inDir = "~/git/Data/pyrazinamide/input/processed/" inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile mcsm_data <- read.csv(inFile , row.names = 1 , stringsAsFactors = F , header = T) str(mcsm_data) my_colnames = colnames(mcsm_data) #==================================== # subset my_df to include only the columns in mcsm data my_df2 = my_df[my_colnames] #==================================== # compare the two head(mcsm_data$Mutationinformation) head(mcsm_data$Position) head(my_df2$Mutationinformation) head(my_df2$Position) # sort mcsm data by Mutationinformation mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),] head(mcsm_data_s$Mutationinformation) head(mcsm_data_s$Position) # now compare: should be True, but is false.... # possibly due to rownames!?! identical(mcsm_data_s, my_df2) # from library dplyr setdiff(mcsm_data_s, my_df2) #from lib compare compare(mcsm_data_s, my_df2) # seems rownames are the problem # FIXME: automate this # write files: checked using meld and files are indeed identical #write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F) #write.csv(my_df2, "my_df2.csv", row.names = F) #====================================================== end of section 1 ########################################################## # Checking: LIG(Filtered dataframe) # ########################################################## # clear workspace rm(list = ls()) ########################### #3) Read file: combined_lig from the script ########################### source("combining_two_df_lig.R") # df with NA: # merged_df2 : # merged_df3: # df without NA: # merged_df2_comp: # merged_df3_comp: ###################### # You need to check it # with the merged_df3 ######################## #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # REASSIGNMENT my_df = merged_df3 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #clear variables rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) # should be true identical(my_df$Position, my_df$position) ################################# # Read file: normalised file # output of step 4 mcsm_pipeline ################################# inDir = "~/git/Data/pyrazinamide/input/processed/" inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile mcsm_data <- read.csv(inFile , row.names = 1 , stringsAsFactors = F , header = T) str(mcsm_data) ########################### # 4a: Filter/subset data: ONLY for LIGand analysis # Lig plots < 10Ang # Filter the lig plots for Dis_to_lig < 10Ang ########################### # sanity checks upos = unique(mcsm_data$Position) # check range of distances max(mcsm_data$Dis_lig_Ang) min(mcsm_data$Dis_lig_Ang) # Lig filtered: subset data to have only values less than 10 Ang mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10) rm(mcsm_data) #to avoid confusion table(mcsm_data2$Dis_lig_Ang<10) table(mcsm_data2$Dis_lig_Ang>10) max(mcsm_data2$Dis_lig_Ang) min(mcsm_data2$Dis_lig_Ang) upos_f = unique(mcsm_data2$Position); upos_f # colnames of df that you will need to subset the bigger df from my_colnames = colnames(mcsm_data2) #==================================== # subset bigger df i.e my_df to include only the columns in mcsm data2 my_df2 = my_df[my_colnames] rm(my_df) #to avoid confusion #==================================== # compare the two head(mcsm_data2$Mutationinformation) head(mcsm_data2$Position) head(my_df2$Mutationinformation) head(my_df2$Position) # sort mcsm data by Mutationinformation mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),] head(mcsm_data2_s$Mutationinformation) head(mcsm_data2_s$Position) # now compare: should be True, but is false.... # possibly due to rownames!?! identical(mcsm_data2_s, my_df2) # from library dplyr setdiff(mcsm_data2_s, my_df2) # from library compare compare(mcsm_data2_s, my_df2) # seems rownames are the problem #FIXME: automate this # write files: checked using meld and files are indeed identical #write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F) #write.csv(my_df2, "my_df2.csv", row.names = F) ########################################################## # extract and write output file for SNP posn: all # ########################################################## head(merged_df3$Position) foo = merged_df3[order(merged_df3$Position),] head(foo$Position) snp_pos_unique = unique(foo$Position); snp_pos_unique # sanity check: table(snp_pos_unique == combined_df$Position) #===================== # write_output files #===================== outDir = "~/Data/pyrazinamide/input/processed/" outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1 print(paste0("Output file name and path will be:","", outFile1)) write.table(snp_pos_unique , outFile1 , row.names = F , col.names = F) ############################################################## # extract and write output file for SNP posn: complete only # ############################################################## head(merged_df3_comp$Position) foo = merged_df3_comp[order(merged_df3_comp$Position),] head(foo$Position) snp_pos_unique = unique(foo$Position); snp_pos_unique # outDir = "~/Data/pyrazinamide/input/processed/" # already set outFile2 = paste0(outDir, "snp_pos_unique_comp.txt") print(paste0("Output file name and path will be:", outFile2)) write.table(snp_pos_unique , outFile2 , row.names = F , col.names = F) #============================== end of script