import commit
This commit is contained in:
commit
bccfe68192
39 changed files with 6837 additions and 0 deletions
257
mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
Normal file
257
mcsm_analysis/pyrazinamide/scripts/source_data_checks.R
Normal file
|
@ -0,0 +1,257 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
|
||||
getwd()
|
||||
|
||||
#########################################################
|
||||
# 1: Installing and loading required packages #
|
||||
#########################################################
|
||||
|
||||
source("Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
|
||||
##########################################################
|
||||
# Checking: Entire data frame and for PS #
|
||||
##########################################################
|
||||
|
||||
###########################
|
||||
#2) Read file: combined one from the script
|
||||
###########################
|
||||
source("combining_two_df.R")
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3:
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp:
|
||||
# merged_df3_comp:
|
||||
|
||||
######################
|
||||
# You need to check it
|
||||
# with the merged_df3
|
||||
########################
|
||||
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
#clear variables
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# should be true
|
||||
identical(my_df$Position, my_df$position)
|
||||
|
||||
#################################
|
||||
# Read file: normalised file
|
||||
# output of step 4 mcsm_pipeline
|
||||
#################################
|
||||
|
||||
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
|
||||
|
||||
mcsm_data <- read.csv(inFile
|
||||
, row.names = 1
|
||||
, stringsAsFactors = F
|
||||
, header = T)
|
||||
str(mcsm_data)
|
||||
my_colnames = colnames(mcsm_data)
|
||||
|
||||
#====================================
|
||||
# subset my_df to include only the columns in mcsm data
|
||||
my_df2 = my_df[my_colnames]
|
||||
#====================================
|
||||
# compare the two
|
||||
head(mcsm_data$Mutationinformation)
|
||||
head(mcsm_data$Position)
|
||||
|
||||
head(my_df2$Mutationinformation)
|
||||
head(my_df2$Position)
|
||||
|
||||
# sort mcsm data by Mutationinformation
|
||||
mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),]
|
||||
head(mcsm_data_s$Mutationinformation)
|
||||
head(mcsm_data_s$Position)
|
||||
|
||||
# now compare: should be True, but is false....
|
||||
# possibly due to rownames!?!
|
||||
identical(mcsm_data_s, my_df2)
|
||||
|
||||
# from library dplyr
|
||||
setdiff(mcsm_data_s, my_df2)
|
||||
|
||||
#from lib compare
|
||||
compare(mcsm_data_s, my_df2) # seems rownames are the problem
|
||||
|
||||
# FIXME: automate this
|
||||
# write files: checked using meld and files are indeed identical
|
||||
#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
|
||||
#write.csv(my_df2, "my_df2.csv", row.names = F)
|
||||
|
||||
|
||||
#====================================================== end of section 1
|
||||
|
||||
|
||||
|
||||
##########################################################
|
||||
# Checking: LIG(Filtered dataframe) #
|
||||
##########################################################
|
||||
|
||||
# clear workspace
|
||||
rm(list = ls())
|
||||
|
||||
###########################
|
||||
#3) Read file: combined_lig from the script
|
||||
###########################
|
||||
source("combining_two_df_lig.R")
|
||||
|
||||
# df with NA:
|
||||
# merged_df2 :
|
||||
# merged_df3:
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp:
|
||||
# merged_df3_comp:
|
||||
|
||||
######################
|
||||
# You need to check it
|
||||
# with the merged_df3
|
||||
########################
|
||||
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
#clear variables
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# should be true
|
||||
identical(my_df$Position, my_df$position)
|
||||
|
||||
#################################
|
||||
# Read file: normalised file
|
||||
# output of step 4 mcsm_pipeline
|
||||
#################################
|
||||
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
|
||||
|
||||
mcsm_data <- read.csv(inFile
|
||||
, row.names = 1
|
||||
, stringsAsFactors = F
|
||||
, header = T)
|
||||
str(mcsm_data)
|
||||
|
||||
###########################
|
||||
# 4a: Filter/subset data: ONLY for LIGand analysis
|
||||
# Lig plots < 10Ang
|
||||
# Filter the lig plots for Dis_to_lig < 10Ang
|
||||
###########################
|
||||
# sanity checks
|
||||
upos = unique(mcsm_data$Position)
|
||||
|
||||
# check range of distances
|
||||
max(mcsm_data$Dis_lig_Ang)
|
||||
min(mcsm_data$Dis_lig_Ang)
|
||||
|
||||
# Lig filtered: subset data to have only values less than 10 Ang
|
||||
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
|
||||
|
||||
rm(mcsm_data) #to avoid confusion
|
||||
|
||||
table(mcsm_data2$Dis_lig_Ang<10)
|
||||
table(mcsm_data2$Dis_lig_Ang>10)
|
||||
|
||||
max(mcsm_data2$Dis_lig_Ang)
|
||||
min(mcsm_data2$Dis_lig_Ang)
|
||||
|
||||
upos_f = unique(mcsm_data2$Position); upos_f
|
||||
|
||||
# colnames of df that you will need to subset the bigger df from
|
||||
my_colnames = colnames(mcsm_data2)
|
||||
#====================================
|
||||
# subset bigger df i.e my_df to include only the columns in mcsm data2
|
||||
my_df2 = my_df[my_colnames]
|
||||
|
||||
rm(my_df) #to avoid confusion
|
||||
#====================================
|
||||
# compare the two
|
||||
head(mcsm_data2$Mutationinformation)
|
||||
head(mcsm_data2$Position)
|
||||
|
||||
head(my_df2$Mutationinformation)
|
||||
head(my_df2$Position)
|
||||
|
||||
# sort mcsm data by Mutationinformation
|
||||
mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),]
|
||||
head(mcsm_data2_s$Mutationinformation)
|
||||
head(mcsm_data2_s$Position)
|
||||
|
||||
# now compare: should be True, but is false....
|
||||
# possibly due to rownames!?!
|
||||
identical(mcsm_data2_s, my_df2)
|
||||
|
||||
# from library dplyr
|
||||
setdiff(mcsm_data2_s, my_df2)
|
||||
|
||||
# from library compare
|
||||
compare(mcsm_data2_s, my_df2) # seems rownames are the problem
|
||||
|
||||
#FIXME: automate this
|
||||
# write files: checked using meld and files are indeed identical
|
||||
#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
|
||||
#write.csv(my_df2, "my_df2.csv", row.names = F)
|
||||
|
||||
|
||||
##########################################################
|
||||
# extract and write output file for SNP posn: all #
|
||||
##########################################################
|
||||
|
||||
head(merged_df3$Position)
|
||||
|
||||
foo = merged_df3[order(merged_df3$Position),]
|
||||
head(foo$Position)
|
||||
|
||||
snp_pos_unique = unique(foo$Position); snp_pos_unique
|
||||
|
||||
# sanity check:
|
||||
table(snp_pos_unique == combined_df$Position)
|
||||
|
||||
#=====================
|
||||
# write_output files
|
||||
#=====================
|
||||
outDir = "~/Data/pyrazinamide/input/processed/"
|
||||
|
||||
|
||||
outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1
|
||||
print(paste0("Output file name and path will be:","", outFile1))
|
||||
|
||||
write.table(snp_pos_unique
|
||||
, outFile1
|
||||
, row.names = F
|
||||
, col.names = F)
|
||||
|
||||
##############################################################
|
||||
# extract and write output file for SNP posn: complete only #
|
||||
##############################################################
|
||||
head(merged_df3_comp$Position)
|
||||
|
||||
foo = merged_df3_comp[order(merged_df3_comp$Position),]
|
||||
head(foo$Position)
|
||||
|
||||
snp_pos_unique = unique(foo$Position); snp_pos_unique
|
||||
|
||||
# outDir = "~/Data/pyrazinamide/input/processed/" # already set
|
||||
|
||||
outFile2 = paste0(outDir, "snp_pos_unique_comp.txt")
|
||||
print(paste0("Output file name and path will be:", outFile2))
|
||||
|
||||
write.table(snp_pos_unique
|
||||
, outFile2
|
||||
, row.names = F
|
||||
, col.names = F)
|
||||
#============================== end of script
|
||||
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue