# LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
# 2020-01-08 16:15:33 +00:00
#
# 257 lines
# 6.6 KiB
# R

# Sanity check 1: compare the merged data frame built by combining_two_df.R
# against the normalised mCSM file, restricted to the mCSM columns.

# Report the working directory, switch to the scripts folder (the source()
# calls below use relative file names), then report it again.
getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()

#########################################################
# 1: Installing and loading required packages
#########################################################
source("Header_TT.R")
#source("barplot_colour_function.R")

##########################################################
# Checking: Entire data frame and for PS
##########################################################

###########################
# 2) Read file: combined one from the script
###########################
source("combining_two_df.R")
# Objects expected from the sourced script:
#   df with NA:    merged_df2, merged_df3
#   df without NA: merged_df2_comp, merged_df3_comp
#
# The check below has to be done with merged_df3.

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df <- merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# clear variables that this section does not use again
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)

# should be TRUE: the two position columns are expected to agree
identical(my_df$Position, my_df$position)

#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir <- "~/git/Data/pyrazinamide/input/processed/"
inFile <- paste0(inDir, "mcsm_complex1_normalised.csv")
inFile

# The first column of the csv is used as row names.
mcsm_data <- read.csv(inFile,
                      row.names = 1,
                      stringsAsFactors = FALSE,
                      header = TRUE)
str(mcsm_data)

my_colnames <- colnames(mcsm_data)

#====================================
# subset my_df to include only the columns in mcsm data
my_df2 <- my_df[my_colnames]
#====================================

# compare the two by eye first
head(mcsm_data$Mutationinformation)
head(mcsm_data$Position)
head(my_df2$Mutationinformation)
head(my_df2$Position)

# sort mcsm data by Mutationinformation
mcsm_data_s <- mcsm_data[order(mcsm_data$Mutationinformation), ]
head(mcsm_data_s$Mutationinformation)
head(mcsm_data_s$Position)

# Raw comparison: reported as FALSE in practice, apparently because the
# row names of the two data frames differ.
identical(mcsm_data_s, my_df2)

# from library dplyr
setdiff(mcsm_data_s, my_df2)

# from library compare
compare(mcsm_data_s, my_df2) # seems rownames are the problem

# Automated comparison: work on copies with row names reset to the default
# 1..n sequence so that only the cell contents (and column attributes) are
# compared. identical() is the strict test; all.equal() either returns TRUE
# or describes what differs (e.g. integer vs numeric columns).
mcsm_cmp <- mcsm_data_s
merged_cmp <- my_df2
rownames(mcsm_cmp) <- NULL
rownames(merged_cmp) <- NULL
identical(mcsm_cmp, merged_cmp)
all.equal(mcsm_cmp, merged_cmp)

# Manual fallback: write both frames without row names and diff them with
# an external tool (previously checked with meld: files were identical).
#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)
#====================================================== end of section 1
##########################################################
# Checking: LIG(Filtered dataframe)
##########################################################
# Sanity check 2: same comparison as above, but for the ligand-filtered
# data (Dis_lig_Ang < 10), followed by the export of unique SNP positions.

# clear workspace
rm(list = ls())

###########################
# 3) Read file: combined_lig from the script
###########################
source("combining_two_df_lig.R")
# Objects expected from the sourced script:
#   df with NA:    merged_df2, merged_df3
#   df without NA: merged_df2_comp, merged_df3_comp
#
# The check below has to be done with merged_df3.

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df <- merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# Clear only the variables that are not used again. merged_df3 and
# merged_df3_comp must stay: the SNP-position export at the end of this
# script reads both, and removing them here made that part fail with
# "object not found".
rm(merged_df2, merged_df2_comp)

# should be TRUE: the two position columns are expected to agree
identical(my_df$Position, my_df$position)

#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################
inDir <- "~/git/Data/pyrazinamide/input/processed/"
inFile <- paste0(inDir, "mcsm_complex1_normalised.csv")
inFile

# The first column of the csv is used as row names.
mcsm_data <- read.csv(inFile,
                      row.names = 1,
                      stringsAsFactors = FALSE,
                      header = TRUE)
str(mcsm_data)

###########################
# 4a: Filter/subset data: ONLY for LIGand analysis
# Lig plots < 10Ang
# Filter the lig plots for Dis_to_lig < 10Ang
###########################
# sanity checks
upos <- unique(mcsm_data$Position)

# check range of distances before filtering
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)

# Lig filtered: subset data to have only values less than 10 Ang
mcsm_data2 <- subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
rm(mcsm_data) # to avoid confusion

# after filtering every row should be < 10 and none > 10
table(mcsm_data2$Dis_lig_Ang < 10)
table(mcsm_data2$Dis_lig_Ang > 10)
max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)

upos_f <- unique(mcsm_data2$Position)
upos_f

# colnames of df that you will need to subset the bigger df from
my_colnames <- colnames(mcsm_data2)

#====================================
# subset bigger df i.e my_df to include only the columns in mcsm data2
my_df2 <- my_df[my_colnames]
rm(my_df) # to avoid confusion
#====================================

# compare the two by eye first
head(mcsm_data2$Mutationinformation)
head(mcsm_data2$Position)
head(my_df2$Mutationinformation)
head(my_df2$Position)

# sort mcsm data by Mutationinformation
mcsm_data2_s <- mcsm_data2[order(mcsm_data2$Mutationinformation), ]
head(mcsm_data2_s$Mutationinformation)
head(mcsm_data2_s$Position)

# Raw comparison: reported as FALSE in practice, apparently because the
# row names of the two data frames differ.
identical(mcsm_data2_s, my_df2)

# from library dplyr
setdiff(mcsm_data2_s, my_df2)

# from library compare
compare(mcsm_data2_s, my_df2) # seems rownames are the problem

# Automated comparison: work on copies with row names reset to the default
# 1..n sequence so that only the cell contents (and column attributes) are
# compared. identical() is the strict test; all.equal() either returns TRUE
# or describes what differs.
mcsm_cmp <- mcsm_data2_s
merged_cmp <- my_df2
rownames(mcsm_cmp) <- NULL
rownames(merged_cmp) <- NULL
identical(mcsm_cmp, merged_cmp)
all.equal(mcsm_cmp, merged_cmp)

# Manual fallback: write both frames without row names and diff them with
# an external tool (previously checked with meld: files were identical).
#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)

##########################################################
# extract and write output file for SNP posn: all
##########################################################
# NOTE(review): merged_df3 here is the one produced by
# combining_two_df_lig.R (sourced above) - confirm that this, and not the
# output of combining_two_df.R, is the intended source for "all" positions.
head(merged_df3$Position)

foo <- merged_df3[order(merged_df3$Position), ]
head(foo$Position)

snp_pos_unique <- unique(foo$Position)
snp_pos_unique

# sanity check:
# NOTE(review): combined_df is not created in this script; presumably it is
# left in the workspace by combining_two_df_lig.R - verify, and verify that
# it has one row per unique position, otherwise the lengths will not match.
table(snp_pos_unique == combined_df$Position)

#=====================
# write_output files
#=====================
# NOTE(review): the input directory above is under "~/git/Data/..." while
# this output directory is under "~/Data/..." - confirm this is intended.
outDir <- "~/Data/pyrazinamide/input/processed/"
outFile1 <- paste0(outDir, "snp_pos_unique.txt")
outFile1
print(paste0("Output file name and path will be:", outFile1))

# one position per line, no header and no row names
write.table(snp_pos_unique,
            outFile1,
            row.names = FALSE,
            col.names = FALSE)

##############################################################
# extract and write output file for SNP posn: complete only
##############################################################
head(merged_df3_comp$Position)

foo <- merged_df3_comp[order(merged_df3_comp$Position), ]
head(foo$Position)

snp_pos_unique <- unique(foo$Position)
snp_pos_unique

# outDir = "~/Data/pyrazinamide/input/processed/" # already set
outFile2 <- paste0(outDir, "snp_pos_unique_comp.txt")
print(paste0("Output file name and path will be:", outFile2))

# one position per line, no header and no row names
write.table(snp_pos_unique,
            outFile2,
            row.names = FALSE,
            col.names = FALSE)
#============================== end of script