LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/source_data_checks.R

getwd()
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
getwd()

#########################################################
# 1: Installing and loading required packages           #
#########################################################

source("Header_TT.R")
#source("barplot_colour_function.R")

##########################################################
#           Checking: Entire data frame and for PS      #
##########################################################

###########################
#2) Read file: combined one from the script
###########################
source("combining_two_df.R")

# df with NA:
# merged_df2
# merged_df3:

# df without NA:
# merged_df2_comp:
# merged_df3_comp:

######################
# You need to check it
# with the merged_df3
########################

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#clear variables
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)

# should be true
identical(my_df$Position, my_df$position)

#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################


inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile

mcsm_data <- read.csv(inFile
                  , row.names = 1
                  , stringsAsFactors = F
                  , header = T)
str(mcsm_data)
my_colnames  = colnames(mcsm_data)

#====================================
# subset my_df to include only the columns in mcsm data
my_df2 = my_df[my_colnames]
#====================================
# compare the two
head(mcsm_data$Mutationinformation)
head(mcsm_data$Position)

head(my_df2$Mutationinformation)
head(my_df2$Position)

# sort mcsm data by Mutationinformation
mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data_s$Mutationinformation)
head(mcsm_data_s$Position)

# now compare: should be True, but is false....
# possibly due to rownames!?!
identical(mcsm_data_s, my_df2)

# from library dplyr
setdiff(mcsm_data_s, my_df2)

#from lib compare
compare(mcsm_data_s, my_df2) # seems rownames are the problem

# FIXME: automate this
# write files: checked using meld and files are indeed identical
#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)


#====================================================== end of section 1


##########################################################
#             Checking: LIG(Filtered dataframe)          #
##########################################################

# clear workspace
rm(list = ls())

###########################
#3) Read file: combined_lig from the script
###########################
source("combining_two_df_lig.R")

# df with NA:
# merged_df2 :
# merged_df3:

# df without NA:
# merged_df2_comp:
# merged_df3_comp:

######################
# You need to check it
# with the merged_df3
########################

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
my_df = merged_df3
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#clear variables
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)

# should be true
identical(my_df$Position, my_df$position)

#################################
# Read file: normalised file
# output of step 4 mcsm_pipeline
#################################

inDir = "~/git/Data/pyrazinamide/input/processed/"
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile

mcsm_data <- read.csv(inFile
                      , row.names = 1
                      , stringsAsFactors = F
                      , header = T)
str(mcsm_data)

###########################
# 4a: Filter/subset data: ONLY for LIGand analysis
# Lig plots < 10Ang
# Filter the lig plots for Dis_to_lig < 10Ang
###########################
# sanity checks
upos = unique(mcsm_data$Position)

# check range of distances
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)

# Lig filtered: subset data to have only values less than 10 Ang
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)

rm(mcsm_data) #to avoid confusion

table(mcsm_data2$Dis_lig_Ang<10)
table(mcsm_data2$Dis_lig_Ang>10)

max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)

upos_f = unique(mcsm_data2$Position); upos_f

# colnames of df that you will need to subset the bigger df from
my_colnames  = colnames(mcsm_data2)
#====================================
# subset bigger df i.e my_df to include only the columns in mcsm data2
my_df2 = my_df[my_colnames]

rm(my_df) #to avoid confusion
#====================================
# compare the two
head(mcsm_data2$Mutationinformation)
head(mcsm_data2$Position)

head(my_df2$Mutationinformation)
head(my_df2$Position)

# sort mcsm data by Mutationinformation
mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),]
head(mcsm_data2_s$Mutationinformation)
head(mcsm_data2_s$Position)

# now compare: should be True, but is false....
# possibly due to rownames!?!
identical(mcsm_data2_s, my_df2)

# from library dplyr
setdiff(mcsm_data2_s, my_df2)

# from library compare
compare(mcsm_data2_s, my_df2) # seems rownames are the problem

#FIXME: automate this
# write files: checked using meld and files are indeed identical
#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
#write.csv(my_df2, "my_df2.csv", row.names = F)


##########################################################
#  extract and write output file for SNP posn: all     #
##########################################################

head(merged_df3$Position)

foo = merged_df3[order(merged_df3$Position),]
head(foo$Position)

snp_pos_unique = unique(foo$Position); snp_pos_unique

# sanity check:
table(snp_pos_unique == combined_df$Position)

#=====================
# write_output files
#=====================
outDir = "~/Data/pyrazinamide/input/processed/"


outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1
print(paste0("Output file name and path will be:","", outFile1))

write.table(snp_pos_unique
            , outFile1
            , row.names = F
            , col.names = F)

##############################################################
#  extract and write output file for SNP posn: complete only #
##############################################################
head(merged_df3_comp$Position)

foo = merged_df3_comp[order(merged_df3_comp$Position),]
head(foo$Position)

snp_pos_unique = unique(foo$Position); snp_pos_unique

# outDir = "~/Data/pyrazinamide/input/processed/" # already set

outFile2 = paste0(outDir, "snp_pos_unique_comp.txt")
print(paste0("Output file name and path will be:", outFile2))

write.table(snp_pos_unique
            , outFile2
            , row.names = F
            , col.names = F)
#============================== end of script