257 lines
6.6 KiB
R
257 lines
6.6 KiB
R
getwd()
|
|
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
|
|
getwd()
|
|
|
|
#########################################################
|
|
# 1: Installing and loading required packages #
|
|
#########################################################
|
|
|
|
source("Header_TT.R")
|
|
#source("barplot_colour_function.R")
|
|
|
|
##########################################################
|
|
# Checking: Entire data frame and for PS #
|
|
##########################################################
|
|
|
|
###########################
|
|
#2) Read file: combined one from the script
|
|
###########################
|
|
source("combining_two_df.R")
|
|
|
|
# df with NA:
|
|
# merged_df2
|
|
# merged_df3:
|
|
|
|
# df without NA:
|
|
# merged_df2_comp:
|
|
# merged_df3_comp:
|
|
|
|
######################
|
|
# You need to check it
|
|
# with the merged_df3
|
|
########################
|
|
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
# REASSIGNMENT
|
|
my_df = merged_df3
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
|
|
#clear variables
|
|
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
|
|
|
# should be true
|
|
identical(my_df$Position, my_df$position)
|
|
|
|
#################################
|
|
# Read file: normalised file
|
|
# output of step 4 mcsm_pipeline
|
|
#################################
|
|
|
|
|
|
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
|
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
|
|
|
|
mcsm_data <- read.csv(inFile
|
|
, row.names = 1
|
|
, stringsAsFactors = F
|
|
, header = T)
|
|
str(mcsm_data)
|
|
my_colnames = colnames(mcsm_data)
|
|
|
|
#====================================
|
|
# subset my_df to include only the columns in mcsm data
|
|
my_df2 = my_df[my_colnames]
|
|
#====================================
|
|
# compare the two
|
|
head(mcsm_data$Mutationinformation)
|
|
head(mcsm_data$Position)
|
|
|
|
head(my_df2$Mutationinformation)
|
|
head(my_df2$Position)
|
|
|
|
# sort mcsm data by Mutationinformation
|
|
mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),]
|
|
head(mcsm_data_s$Mutationinformation)
|
|
head(mcsm_data_s$Position)
|
|
|
|
# now compare: should be True, but is false....
|
|
# possibly due to rownames!?!
|
|
identical(mcsm_data_s, my_df2)
|
|
|
|
# from library dplyr
|
|
setdiff(mcsm_data_s, my_df2)
|
|
|
|
#from lib compare
|
|
compare(mcsm_data_s, my_df2) # seems rownames are the problem
|
|
|
|
# FIXME: automate this
|
|
# write files: checked using meld and files are indeed identical
|
|
#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
|
|
#write.csv(my_df2, "my_df2.csv", row.names = F)
|
|
|
|
|
|
#====================================================== end of section 1
|
|
|
|
|
|
|
|
##########################################################
|
|
# Checking: LIG(Filtered dataframe) #
|
|
##########################################################
|
|
|
|
# clear workspace
|
|
rm(list = ls())
|
|
|
|
###########################
|
|
#3) Read file: combined_lig from the script
|
|
###########################
|
|
source("combining_two_df_lig.R")
|
|
|
|
# df with NA:
|
|
# merged_df2 :
|
|
# merged_df3:
|
|
|
|
# df without NA:
|
|
# merged_df2_comp:
|
|
# merged_df3_comp:
|
|
|
|
######################
|
|
# You need to check it
|
|
# with the merged_df3
|
|
########################
|
|
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
# REASSIGNMENT
|
|
my_df = merged_df3
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
|
|
#clear variables
|
|
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
|
|
|
# should be true
|
|
identical(my_df$Position, my_df$position)
|
|
|
|
#################################
|
|
# Read file: normalised file
|
|
# output of step 4 mcsm_pipeline
|
|
#################################
|
|
|
|
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
|
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
|
|
|
|
mcsm_data <- read.csv(inFile
|
|
, row.names = 1
|
|
, stringsAsFactors = F
|
|
, header = T)
|
|
str(mcsm_data)
|
|
|
|
###########################
|
|
# 4a: Filter/subset data: ONLY for LIGand analysis
|
|
# Lig plots < 10Ang
|
|
# Filter the lig plots for Dis_to_lig < 10Ang
|
|
###########################
|
|
# sanity checks
|
|
upos = unique(mcsm_data$Position)
|
|
|
|
# check range of distances
|
|
max(mcsm_data$Dis_lig_Ang)
|
|
min(mcsm_data$Dis_lig_Ang)
|
|
|
|
# Lig filtered: subset data to have only values less than 10 Ang
|
|
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
|
|
|
|
rm(mcsm_data) #to avoid confusion
|
|
|
|
table(mcsm_data2$Dis_lig_Ang<10)
|
|
table(mcsm_data2$Dis_lig_Ang>10)
|
|
|
|
max(mcsm_data2$Dis_lig_Ang)
|
|
min(mcsm_data2$Dis_lig_Ang)
|
|
|
|
upos_f = unique(mcsm_data2$Position); upos_f
|
|
|
|
# colnames of df that you will need to subset the bigger df from
|
|
my_colnames = colnames(mcsm_data2)
|
|
#====================================
|
|
# subset bigger df i.e my_df to include only the columns in mcsm data2
|
|
my_df2 = my_df[my_colnames]
|
|
|
|
rm(my_df) #to avoid confusion
|
|
#====================================
|
|
# compare the two
|
|
head(mcsm_data2$Mutationinformation)
|
|
head(mcsm_data2$Position)
|
|
|
|
head(my_df2$Mutationinformation)
|
|
head(my_df2$Position)
|
|
|
|
# sort mcsm data by Mutationinformation
|
|
mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),]
|
|
head(mcsm_data2_s$Mutationinformation)
|
|
head(mcsm_data2_s$Position)
|
|
|
|
# now compare: should be True, but is false....
|
|
# possibly due to rownames!?!
|
|
identical(mcsm_data2_s, my_df2)
|
|
|
|
# from library dplyr
|
|
setdiff(mcsm_data2_s, my_df2)
|
|
|
|
# from library compare
|
|
compare(mcsm_data2_s, my_df2) # seems rownames are the problem
|
|
|
|
#FIXME: automate this
|
|
# write files: checked using meld and files are indeed identical
|
|
#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
|
|
#write.csv(my_df2, "my_df2.csv", row.names = F)
|
|
|
|
|
|
##########################################################
|
|
# extract and write output file for SNP posn: all #
|
|
##########################################################
|
|
|
|
head(merged_df3$Position)
|
|
|
|
foo = merged_df3[order(merged_df3$Position),]
|
|
head(foo$Position)
|
|
|
|
snp_pos_unique = unique(foo$Position); snp_pos_unique
|
|
|
|
# sanity check:
|
|
table(snp_pos_unique == combined_df$Position)
|
|
|
|
#=====================
|
|
# write_output files
|
|
#=====================
|
|
outDir = "~/Data/pyrazinamide/input/processed/"
|
|
|
|
|
|
outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1
|
|
print(paste0("Output file name and path will be:","", outFile1))
|
|
|
|
write.table(snp_pos_unique
|
|
, outFile1
|
|
, row.names = F
|
|
, col.names = F)
|
|
|
|
##############################################################
|
|
# extract and write output file for SNP posn: complete only #
|
|
##############################################################
|
|
head(merged_df3_comp$Position)
|
|
|
|
foo = merged_df3_comp[order(merged_df3_comp$Position),]
|
|
head(foo$Position)
|
|
|
|
snp_pos_unique = unique(foo$Position); snp_pos_unique
|
|
|
|
# outDir = "~/Data/pyrazinamide/input/processed/" # already set
|
|
|
|
outFile2 = paste0(outDir, "snp_pos_unique_comp.txt")
|
|
print(paste0("Output file name and path will be:", outFile2))
|
|
|
|
write.table(snp_pos_unique
|
|
, outFile2
|
|
, row.names = F
|
|
, col.names = F)
|
|
#============================== end of script
|
|
|
|
|