renamed files with shorter names and corrected paths for input and output files

This commit is contained in:
Tanushree Tunstall 2020-01-13 10:39:49 +00:00
parent 4be0de97d7
commit 61fcd14b17
8 changed files with 244 additions and 164 deletions

View file

@ -1,6 +1,9 @@
# run step0-step3a for mcsm pipeline #!/bin/bash
# run all bash scripts for mcsm
#./step0_check_duplicate_SNPs.sh #./step0_check_duplicate_SNPs.sh
#./step1_mCSMLig_curl_submit_store_outputurl.sh #./step1_lig_output_urls.sh
./step2_mCSM_LIG_batch_outputurls_results.sh ./step2_lig_results.sh
./step3a_mCSM_LIG_regex_output_formatting.sh ./step3a_results_format_interim.sh

View file

@ -24,7 +24,7 @@ infile_mut="/pnca_mis_SNPs_v2_unique.csv"
infile_struc="/complex1_no_water.pdb" infile_struc="/complex1_no_water.pdb"
outpath="${inpath}${processed_path}" outpath="${inpath}${processed_path}"
outfile="/mCSM_lig_complex1_result_url.txt" outfile="/complex1_result_url.txt"
# create valid input and output filenames # create valid input and output filenames
#filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv" #filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
@ -82,7 +82,7 @@ echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."
# create output file with the added number of muts from file # create output file with the added number of muts from file
# after much thought, bad idea as less generic! # after much thought, bad idea as less generic!
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt #echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
echo -e "${host}${result_url}" >> ${outfilename} echo -e "${host}${result_url}" >> ${outfilename}
#echo -n '.' #echo -n '.'
done < "${filename}" done < "${filename}"

View file

@ -35,10 +35,10 @@
# specify variables for input and output paths and filenames # specify variables for input and output paths and filenames
inpath="${HOME}/git/Data/pyrazinamide/input" inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed" processed_path="/processed"
infile="/mCSM_lig_complex1_result_url.txt" infile="/complex1_result_url.txt"
outpath="${inpath}${processed_path}" outpath="${inpath}${processed_path}"
outfile="/mCSM_lig_complex1_output_MASTER.txt" outfile="/complex1_output_MASTER.txt"
# create valid input and output filenames # create valid input and output filenames
filename="${inpath}${processed_path}${infile}" filename="${inpath}${processed_path}${infile}"

View file

@ -27,12 +27,11 @@ inpath="${HOME}/git/Data/pyrazinamide/input"
processed_path="/processed" processed_path="/processed"
# Create input file: copy and rename output file of step2 # Create input file: copy and rename output file of step2
oldfile="${inpath}${processed_path}/mCSM_lig_complex1_output_MASTER.txt" oldfile="${inpath}${processed_path}/complex1_output_MASTER.txt"
newfile="${inpath}${processed_path}/mCSM_lig_complex1_output_processed.txt" newfile="${inpath}${processed_path}/complex1_output_processed.txt"
cp $oldfile $newfile cp $oldfile $newfile
#infile="../Results/336_mCSM_lig_complex1_output_processed.txt" infile="/complex1_output_processed.txt"
infile="/mCSM_lig_complex1_output_processed.txt"
filename="${inpath}${processed_path}${infile}" filename="${inpath}${processed_path}${infile}"
echo Input filename is : ${filename} echo Input filename is : ${filename}

View file

@ -1,29 +0,0 @@
#!/usr/bin/python
import pandas as pd
from collections import defaultdict
#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'
# Columns written to the csv, in this order. Every name must appear as a
# "colname" key in the input file, otherwise to_csv raises a KeyError.
outCols=[
'PredictedAffinityChange',
'Mutationinformation',
'Wild-type',
'Position',
'Mutant-type',
'Chain',
'LigandID',
'Distancetoligand',
'DUETstabilitychange'
]
# Read the "colname:value" lines; the with-block closes the file handle
# even if reading fails (the bare open() used before never closed it).
with open('../Results/336_mCSM_lig_complex1_output_processed.txt') as handle:
    lines = [line.rstrip('\n') for line in handle]
# Group the values by column name: one list per column, in file order.
outputs = defaultdict(list)
for item in lines:
    # a blank (e.g. trailing) line carries no record: skip it instead of
    # crashing on the unpack below
    if not item.strip():
        continue
    # split on the FIRST colon only, so a value that itself contains ':'
    # is kept whole; a line with no colon still raises ValueError
    col, val = item.split(':', 1)
    outputs[col].append(val)
# Each key becomes a column; pandas raises ValueError if the columns
# ended up with different lengths.
dfOut=pd.DataFrame(outputs)
dfOut.to_csv('../Results/336_complex1_formatted_results.csv', columns=outCols)

View file

@ -0,0 +1,61 @@
#!/usr/bin/python
###################
# load libraries
import os, sys
import pandas as pd
from collections import defaultdict
####################
#********************************************************************
# TASK: Formatting results with nice colnames
# step3a processed the mcsm results to remove all newlines and
# brought data in a format where the delimiter ":" splits
# data into a convenient format of "colname": "value".
# this script formats the data and outputs a df with each row
# as a mutation and its corresponding mcsm_values
# Requirements:
# input: output of step3a, file containing "..._output_processed.txt"
# path: "Data/<drug>/input/processed/<filename>"
# output: formatted .csv file
# path: "Data/<drug>/input/processed/<filename>"
#***********************************************************************
# specify variables for input and output paths and filenames
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
basedir = "/git/Data/pyrazinamide/input"
inpath = "/processed"
in_filename = "/complex1_output_processed.txt"
# every component starts with "/", so plain concatenation builds the path
infile = homedir + basedir + inpath + in_filename
print("Input file is:", infile)
outpath = "/processed"
out_filename = "/complex1_formatted_results.csv"
outfile = homedir + basedir + outpath + out_filename
print("Output file is:", outfile)
# end of variable assignment for input and output files

# Columns written to the csv, in this order. Every name must appear as a
# "colname" key in the input file, otherwise to_csv raises a KeyError.
outCols=[
'PredictedAffinityChange',
'Mutationinformation',
'Wild-type',
'Position',
'Mutant-type',
'Chain',
'LigandID',
'Distancetoligand',
'DUETstabilitychange'
]


def parse_colon_records(lines):
    """Group "colname:value" lines into a mapping of colname -> values.

    Parameters:
        lines: iterable of strings (an open text file works), one
            "colname:value" record per line; a trailing newline on a
            line is removed.

    Returns:
        defaultdict(list) mapping each column name to the list of its
        values, in the order the lines were read.

    Raises:
        ValueError: if a non-blank line contains no ':' delimiter.
    """
    outputs = defaultdict(list)
    for lineno, raw in enumerate(lines, start=1):
        item = raw.rstrip('\n')
        # a blank (e.g. trailing) line carries no record: skip it
        if not item.strip():
            continue
        # partition splits on the FIRST colon only, so a value that
        # itself contains ':' is kept whole
        col, sep, val = item.partition(':')
        if not sep:
            raise ValueError(
                "line {}: expected 'colname:value', got {!r}".format(lineno, item))
        outputs[col].append(val)
    return outputs


# Do the file I/O only when run as a script, so the parser above can be
# imported (and tested) without the input file being present.
if __name__ == "__main__":
    # the with-block closes the input file even if parsing fails
    with open(infile) as handle:
        outputs = parse_colon_records(handle)
    # each key becomes a column; pandas raises ValueError if the columns
    # ended up with different lengths
    dfOut=pd.DataFrame(outputs)
    dfOut.to_csv(outfile, columns=outCols)

View file

@ -1,22 +1,42 @@
getwd() getwd()
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work #setwd("~/git/LSHTM_analysis/mcsm_complex1/Results")
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
getwd() getwd()
#======================================================= #=======================================================
# TASK: read formatted_results_df.csv to complete
# missing info, adding DUET categories, assigning
# meaningful colnames, etc.
# Requirements:
# input: output of step3b, python processing,
# path: "Data/<drug>/input/processed/<filename>"
# output: NO output, as the next script refers to this
# for yet more processing
#=======================================================
# specify variables for input and output paths and filenames
homedir = "~"
basedir = "/git/Data/pyrazinamide/input"
inpath = "/processed"
in_filename = "/complex1_formatted_results.csv"
infile = paste0(homedir, basedir, inpath, in_filename)
print(paste0("Input file is:", infile))
#======================================================
#TASK: To tidy the columns so you can generate figures #TASK: To tidy the columns so you can generate figures
#======================================================= #=======================================================
#################### ####################
#### read file #####: this will be the output from python script (csv file) #### read file #####: this will be the output from python script (csv file)
#################### ####################
data = read.csv("336_complex1_formatted_results.csv" data = read.csv(infile
, header = T , header = T
, stringsAsFactors = FALSE) , stringsAsFactors = FALSE)
dim(data) dim(data)
#335, 10
str(data) str(data)
# clear variables
rm(homedir, basedir, inpath, in_filename, infile)
########################### ###########################
##### Data processing ##### ##### Data processing #####
########################### ###########################
@ -31,34 +51,30 @@ data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.typ
head(data$Mutationinformation) head(data$Mutationinformation)
tail(data$Mutationinformation) tail(data$Mutationinformation)
#write.csv(data, 'test.csv') #write.csv(data, 'test.csv')
########################################## ##########################################
# Remove duplicate SNPs as a sanity check # Remove duplicate SNPs as a sanity check
########################################## ##########################################
# very important # very important
table(duplicated(data$Mutationinformation)) table(duplicated(data$Mutationinformation))
#FALSE
#335
# extract duplicated entries # extract duplicated entries
dups = data[duplicated(data$Mutationinformation),] #0 dups = data[duplicated(data$Mutationinformation),] #0
# No of dups should match with the no. of TRUE in the above table # No of dups should match with the no. of TRUE in the above table
#u_dups = unique(dups$Mutationinformation) #10 #u_dups = unique(dups$Mutationinformation) #10
sum( table(dups$Mutationinformation) ) #13 sum( table(dups$Mutationinformation) )
rm(dups)
#*************************************************************** #***************************************************************
# select non-duplicated SNPs and create a new df # select non-duplicated SNPs and create a new df
df = data[!duplicated(data$Mutationinformation),] #309, 10 df = data[!duplicated(data$Mutationinformation),]
#*************************************************************** #***************************************************************
# sanity check # sanity check
u = unique(df$Mutationinformation) u = unique(df$Mutationinformation)
u2 = unique(data$Mutationinformation) u2 = unique(data$Mutationinformation)
table(u%in%u2) table(u%in%u2)
#TRUE
#309 # should all be 1
#should all be 1, hence 309 1's
sum(table(df$Mutationinformation) == 1) sum(table(df$Mutationinformation) == 1)
# sort df by Position # sort df by Position
@ -66,6 +82,7 @@ sum(table(df$Mutationinformation) == 1)
#foo <- df[order(df$Position),] #foo <- df[order(df$Position),]
#df <- df[order(df$Position),] #df <- df[order(df$Position),]
# clear variables
rm(u, u2, dups) rm(u, u2, dups)
#################### ####################
@ -76,7 +93,7 @@ rm(u, u2, dups)
#STEP 1 #STEP 1
#======== #========
# make a copy of the PredictedAffinityColumn and call it Lig_outcome # make a copy of the PredictedAffinityColumn and call it Lig_outcome
df$Lig_outcome = df$PredictedAffinityChange #335, 11 df$Lig_outcome = df$PredictedAffinityChange
#make Predicted...column numeric and outcome column categorical #make Predicted...column numeric and outcome column categorical
head(df$PredictedAffinityChange) head(df$PredictedAffinityChange)
@ -89,8 +106,10 @@ head(df$PredictedAffinityChange)
# should be numeric, check and if not make it numeric # should be numeric, check and if not make it numeric
is.numeric( df$PredictedAffinityChange ) is.numeric( df$PredictedAffinityChange )
# change to numeric # change to numeric
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange) df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
# should be TRUE # should be TRUE
is.numeric( df$PredictedAffinityChange ) is.numeric( df$PredictedAffinityChange )
@ -109,10 +128,13 @@ df$Lig_outcome = gsub("^.*-"
df$Lig_outcome) df$Lig_outcome)
# sanity checks # sanity checks
head(df$Lig_outcome) head(df$Lig_outcome)
# should be factor, check and if not change it to factor # should be factor, check and if not change it to factor
is.factor(df$Lig_outcome) is.factor(df$Lig_outcome)
# change to factor # change to factor
df$Lig_outcome = as.factor(df$Lig_outcome) df$Lig_outcome = as.factor(df$Lig_outcome)
# should be TRUE # should be TRUE
is.factor(df$Lig_outcome) is.factor(df$Lig_outcome)
@ -126,10 +148,13 @@ df$Distancetoligand = gsub("&Aring;"
, df$Distancetoligand) , df$Distancetoligand)
# sanity checks # sanity checks
head(df$Distancetoligand) head(df$Distancetoligand)
# should be numeric, check if not change it to numeric # should be numeric, check if not change it to numeric
is.numeric(df$Distancetoligand) is.numeric(df$Distancetoligand)
# change to numeric # change to numeric
df$Distancetoligand = as.numeric(df$Distancetoligand) df$Distancetoligand = as.numeric(df$Distancetoligand)
# should be TRUE # should be TRUE
is.numeric(df$Distancetoligand) is.numeric(df$Distancetoligand)
@ -148,10 +173,13 @@ df$DUETstabilitychange = gsub("Kcal/mol"
, df$DUETstabilitychange) , df$DUETstabilitychange)
# sanity checks # sanity checks
head(df$DUETstabilitychange) head(df$DUETstabilitychange)
# should be numeric, check if not change it to numeric # should be numeric, check if not change it to numeric
is.numeric(df$DUETstabilitychange) is.numeric(df$DUETstabilitychange)
# change to numeric # change to numeric
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange) df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
# should be TRUE # should be TRUE
is.numeric(df$DUETstabilitychange) is.numeric(df$DUETstabilitychange)
@ -166,22 +194,17 @@ colnames(df)[n]
# create yet another extra column: classification of DUET stability only # create yet another extra column: classification of DUET stability only
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0 df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
, "Stabilizing" , "Stabilizing"
, "Destabilizing") #335, 12 , "Destabilizing") # spelling to be consistent with mcsm
table(df$Lig_outcome) table(df$Lig_outcome)
#Destabilizing Stabilizing
#281 54
table(df$DUET_outcome) table(df$DUET_outcome)
#Destabilizing Stabilizing
#288 47
#============================== #==============================
#FIXME #FIXME
#Insert a venn diagram #Insert a venn diagram
#================================ #================================
#======== #========
#STEP 6 #STEP 6
#======== #========
@ -199,7 +222,7 @@ colnames(df[mut])
#STEP 7 #STEP 7
#======== #========
# create an extra column: maybe useful for some plots # create an extra column: maybe useful for some plots
df$WildPos = paste0(df$Wild_type, df$Position) #335, 13 df$WildPos = paste0(df$Wild_type, df$Position)
# clear variables # clear variables
rm(n, wt, mut) rm(n, wt, mut)

View file

@ -1,7 +1,8 @@
getwd() ##################
#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work # load libraries
setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad library(compare)
#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac ##################
getwd() getwd()
#======================================================= #=======================================================
@ -9,17 +10,38 @@ getwd()
# of DUET stability scores # of DUET stability scores
# of Pred affinity # of Pred affinity
# compare scaling methods with plots # compare scaling methods with plots
# Requirements:
# input: R script, step3c_results_cleaning.R
# path: "Data/<drug>/input/processed/<filename>"
# output: normalised .csv file, which the next script refers to
# for yet more processing
# output normalised file # output normalised file
#======================================================= #=======================================================
# specify variables for input and output paths and filenames
homedir = "~"
currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm"
in_filename = "/step3c_results_cleaning.R"
infile = paste0(homedir, currdir, in_filename)
print(paste0("Input file is:", infile))
# output file
basedir = "/git/Data/pyrazinamide/input"
outpath = "/processed"
out_filename = "/mcsm_complex1_normalised.csv"
outfile = paste0(homedir, basedir, outpath, out_filename)
print(paste0("Output file is:", outfile))
#################### ####################
#### read file #####: this will be the output of my R script that cleans the data columns #### read file #####: this will be the output of my R script that cleans the data columns
#################### ####################
source("../Scripts/step3c_data_cleaning.R") source(infile)
##This will output two dataframes:
##data: unclean data: 335, 10 #This will output two dataframes:
##df : cleaned df 335, 13 # data: unclean data: 10 cols
## you can remove data if you want as you will not need it # df : cleaned df: 13 cols
# you can remove data if you want as you will not need it
rm(data) rm(data)
colnames(df) colnames(df)
@ -49,26 +71,25 @@ mean(df[,n])
#-0.9526746 #-0.9526746
tapply(df[,n], df[,group], mean) tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.2112100 0.3926667
#=========================== #===========================
# Same as above: in 2 steps # Same as above: in 2 steps
#=========================== #===========================
# find range of your data # find range of your data
my_min = min(df[,n]); my_min #-3.948 my_min = min(df[,n]); my_min #
my_max = max(df[,n]); my_max #2.23 my_max = max(df[,n]); my_max #
#=============================================== #===============================================
# WITHIN GROUP rescaling 2: method "ratio" # WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values # create column to store the rescaled values
# Rescaling separately (Less dangerous) # Rescaling separately (Less dangerous)
# =====> chosen one:as Nick prefers # =====> chosen one: preserves sign
#=============================================== #===============================================
df$ratioPredAff = ifelse(df[,n] < 0 df$ratioPredAff = ifelse(df[,n] < 0
, df[,n]/abs(my_min) , df[,n]/abs(my_min)
, df[,n]/my_max , df[,n]/my_max
)#335 14 )# 14 cols
# sanity checks # sanity checks
head(df$ratioPredAff) head(df$ratioPredAff)
tail(df$ratioPredAff) tail(df$ratioPredAff)
@ -76,19 +97,13 @@ tail(df$ratioPredAff)
min(df$ratioPredAff); max(df$ratioPredAff) min(df$ratioPredAff); max(df$ratioPredAff)
tapply(df$ratioPredAff, df$Lig_outcome, min) tapply(df$ratioPredAff, df$Lig_outcome, min)
#Destabilizing Stabilizing
#-1.000000000 0.005381166
tapply(df$ratioPredAff, df$Lig_outcome, max) tapply(df$ratioPredAff, df$Lig_outcome, max)
#Destabilizing Stabilizing
#-0.001266464 1.000000000
#should be the same as below (281 and 54) # should be the same as below
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0) sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
table(df$Lig_outcome) table(df$Lig_outcome)
#Destabilizing Stabilizing
#281 54
#=============================================== #===============================================
# Hist and density plots to compare the rescaling # Hist and density plots to compare the rescaling
@ -140,7 +155,7 @@ rm(my_min, my_max, my_title, n, group)
#=================== #===================
# 3b: DUET stability # 3b: DUET stability
#=================== #===================
dim(df) #335, 14 dim(df) # 14 cols
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10 n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
group = which(colnames(df) == "DUET_outcome"); group #12 group = which(colnames(df) == "DUET_outcome"); group #12
@ -160,26 +175,23 @@ tail(df[,n]) #positives
# sanity checks # sanity checks
mean(df[,n]) mean(df[,n])
#[1] -1.173316
tapply(df[,n], df[,group], mean) tapply(df[,n], df[,group], mean)
#Destabilizing Stabilizing
#-1.4297257 0.3978723
#=============================================== #===============================================
# WITHIN GROUP rescaling 2: method "ratio" # WITHIN GROUP rescaling 2: method "ratio"
# create column to store the rescaled values # create column to store the rescaled values
# Rescaling separately (Less dangerous) # Rescaling separately (Less dangerous)
# =====> chosen one:as Nick prefers # =====> chosen one: preserves sign
#=============================================== #===============================================
# find range of your data # find range of your data
my_min = min(df[,n]); my_min #-3.87 my_min = min(df[,n]); my_min
my_max = max(df[,n]); my_max #1.689 my_max = max(df[,n]); my_max
df$ratioDUET = ifelse(df[,n] < 0 df$ratioDUET = ifelse(df[,n] < 0
, df[,n]/abs(my_min) , df[,n]/abs(my_min)
, df[,n]/my_max , df[,n]/my_max
) #335, 15 ) # 15 cols
# sanity check # sanity check
head(df$ratioDUET) head(df$ratioDUET)
tail(df$ratioDUET) tail(df$ratioDUET)
@ -188,26 +200,19 @@ min(df$ratioDUET); max(df$ratioDUET)
# sanity checks # sanity checks
tapply(df$ratioDUET, df$DUET_outcome, min) tapply(df$ratioDUET, df$DUET_outcome, min)
#Destabilizing Stabilizing
#-1.00000000 0.01065719
tapply(df$ratioDUET, df$DUET_outcome, max) tapply(df$ratioDUET, df$DUET_outcome, max)
#Destabilizing Stabilizing
#-0.003875969 1.000000000
# should be the same as below (267 and 42) # should be the same as below (267 and 42)
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0) sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
table(df$DUET_outcome) table(df$DUET_outcome)
#Destabilizing Stabilizing
#288 47
#=============================================== #===============================================
# Hist and density plots to compare the rescaling # Hist and density plots to compare the rescaling
# methods: Base R # methods: Base R
#=============================================== #===============================================
# uncomment as necessary # uncomment as necessary
my_title = "DUET_stability" my_title = "DUET_stability"
#my_title = colnames(df[n]) #my_title = colnames(df[n])
@ -246,7 +251,25 @@ mtext(text = my_title
, line = 0 , line = 0
, outer = TRUE) , outer = TRUE)
# reorder by column name
#data <- data[c("A", "B", "C")]
colnames(df)
df2 = df[c("X", "Mutationinformation", "WildPos", "Position"
, "Wild_type", "Mutant_type"
, "DUETStability_Kcalpermol", "DUET_outcome"
, "Dis_lig_Ang", "PredAffLog", "Lig_outcome"
, "ratioDUET", "ratioPredAff"
, "LigandID","Chain")]
# sanity check
# should be True
#compare(df, df2, allowAll = T)
compare(df, df2, ignoreColOrder = T)
#TRUE
#reordered columns
#=================== #===================
# write output as csv file # write output as csv file
#=================== #===================
write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15 #write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE)
write.csv(df2, outfile, row.names = FALSE)