renamed files with shorter names and corrected paths for input and output files

2020-01-13 10:39:49 +00:00 · 2020-01-13 10:39:49 +00:00 · 61fcd14b17
commit 61fcd14b17
parent 4be0de97d7
8 changed files with 244 additions and 164 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh
@ -1,6 +1,9 @@
-# run step0-step3a for mcsm pipeline
+#!/bin/bash
+
+# run all bash scripts for mcsm

 #./step0_check_duplicate_SNPs.sh
-#./step1_mCSMLig_curl_submit_store_outputurl.sh
-./step2_mCSM_LIG_batch_outputurls_results.sh
-./step3a_mCSM_LIG_regex_output_formatting.sh
+#./step1_lig_output_urls.sh
+./step2_lig_results.sh
+./step3a_results_format_interim.sh
+
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
@ -24,7 +24,7 @@ infile_mut="/pnca_mis_SNPs_v2_unique.csv"
 infile_struc="/complex1_no_water.pdb"

 outpath="${inpath}${processed_path}"
-outfile="/mCSM_lig_complex1_result_url.txt"
+outfile="/complex1_result_url.txt"

 # create valid input and output filenames
 #filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
@ -82,7 +82,7 @@ echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."

 # create output file with the added number of muts from file
 # after much thought, bad idea as less generic!
-#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt
+#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
 echo -e "${host}${result_url}" >> ${outfilename}
 #echo -n '.'
 done < "${filename}"
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
@ -35,10 +35,10 @@
 # specify variables for input and output paths and filenames
 inpath="${HOME}/git/Data/pyrazinamide/input"
 processed_path="/processed"
-infile="/mCSM_lig_complex1_result_url.txt"
+infile="/complex1_result_url.txt"

 outpath="${inpath}${processed_path}"
-outfile="/mCSM_lig_complex1_output_MASTER.txt"
+outfile="/complex1_output_MASTER.txt"

 # create valid input and output filenames
 filename="${inpath}${processed_path}${infile}"
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
@ -27,12 +27,11 @@ inpath="${HOME}/git/Data/pyrazinamide/input"
 processed_path="/processed"

 # Create input file: copy and rename output file of step2
-oldfile="${inpath}${processed_path}/mCSM_lig_complex1_output_MASTER.txt"
-newfile="${inpath}${processed_path}/mCSM_lig_complex1_output_processed.txt"
+oldfile="${inpath}${processed_path}/complex1_output_MASTER.txt"
+newfile="${inpath}${processed_path}/complex1_output_processed.txt"
 cp $oldfile $newfile

-#infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
-infile="/mCSM_lig_complex1_output_processed.txt"
+infile="/complex1_output_processed.txt"
 filename="${inpath}${processed_path}${infile}"

 echo Input filename is : ${filename}
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
@ -1,29 +0,0 @@
-#!/usr/bin/python
-import pandas as pd
-from collections import defaultdict
-
-#file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'
-
-outCols=[
-        'PredictedAffinityChange',
-        'Mutationinformation',
-        'Wild-type',
-        'Position',
-        'Mutant-type',
-        'Chain',
-        'LigandID',
-        'Distancetoligand',
-        'DUETstabilitychange'
-        ]
-
-lines = [line.rstrip('\n') for line in open('../Results/336_mCSM_lig_complex1_output_processed.txt')]
-
-outputs = defaultdict(list)
-
-for item in lines:
-	col, val = item.split(':')
-	outputs[col].append(val)
-
-dfOut=pd.DataFrame(outputs)
-
-pd.DataFrame.to_csv(dfOut,'../Results/336_complex1_formatted_results.csv', columns=outCols)
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py
@ -0,0 +1,61 @@
+#!/usr/bin/python
+
+###################
+# load libraries
+import os, sys
+import pandas as pd
+from collections import defaultdict
+####################
+
+#********************************************************************
+# TASK: Formatting results with nice colnames
+# step3a processed the mcsm results to remove all newlines and 
+# brought data in a format where the delimiter ":" splits
+# data into a convenient format of "colname": "value".
+# this script formats the data and outputs a df with each row
+# as a mutation and its corresponding mcsm_values
+
+# Requirements:
+# input: output of step3a, file containing  "..._output_processed.txt"
+	# path: "Data/<drug>/input/processed/<filename>"
+# output: formatted .csv file
+	# path: "Data/<drug>/input/processed/<filename>"
+#***********************************************************************
+# specify variables for input and output paths and filenames
+homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+
+basedir = "/git/Data/pyrazinamide/input"
+inpath = "/processed"
+in_filename = "/complex1_output_processed.txt"
+infile = homedir + basedir + inpath + in_filename
+print("Input file is:", infile)
+
+outpath = "/processed"
+out_filename = "/complex1_formatted_results.csv"
+outfile = homedir + basedir + outpath + out_filename
+print("Output file is:", outfile)
+# end of variable assignment for input and output files
+
+outCols=[
+        'PredictedAffinityChange',
+        'Mutationinformation',
+        'Wild-type',
+        'Position',
+        'Mutant-type',
+        'Chain',
+        'LigandID',
+        'Distancetoligand',
+        'DUETstabilitychange'
+        ]
+
+lines = [line.rstrip('\n') for line in open(infile)]
+
+outputs = defaultdict(list)
+
+for item in lines:
+	col, val = item.split(':')
+	outputs[col].append(val)
+
+dfOut=pd.DataFrame(outputs)
+
+pd.DataFrame.to_csv(dfOut, outfile, columns=outCols)
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R
@ -1,22 +1,42 @@
 getwd()
-#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
-setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
-#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
+#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results")
 getwd()

 #=======================================================
+# TASK: read complex1_formatted_results.csv to fill in
+# missing info, add DUET categories, assign
+# meaningful colnames, etc.
+
+# Requirements:
+# input: output of step3b (python processing)
+  # path: "Data/<drug>/input/processed/<filename>"
+# output: NO output, as the next script refers to this one
+# for yet more processing
+#=======================================================
+
+# specify variables for input and output paths and filenames
+homedir = "~"
+basedir = "/git/Data/pyrazinamide/input"
+inpath = "/processed"
+in_filename = "/complex1_formatted_results.csv"
+infile = paste0(homedir, basedir, inpath, in_filename)
+print(paste0("Input file is:", infile))
+
+#======================================================
 #TASK: To tidy the columns so you can generate figures
 #=======================================================
 ####################
 #### read file #####: this will be the output from python script (csv file)
 ####################
-data = read.csv("336_complex1_formatted_results.csv"
+data = read.csv(infile
              , header = T
              , stringsAsFactors = FALSE)
 dim(data)
-#335, 10
 str(data)

+# clear variables
+rm(homedir, basedir, inpath, in_filename, infile)
+
 ###########################
 ##### Data processing #####
 ###########################
@ -31,41 +51,38 @@ data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.typ
 head(data$Mutationinformation)
 tail(data$Mutationinformation)
 #write.csv(data, 'test.csv')
+
 ##########################################
 # Remove duplicate SNPs as a sanity check
 ##########################################
-#very important
+# very important
 table(duplicated(data$Mutationinformation))
-#FALSE   
-#335

-#extract duplicated entries
+# extract duplicated entries
 dups = data[duplicated(data$Mutationinformation),] #0

-#No of dups should match with the no. of TRUE in the above table 
+# No of dups should match with the no. of TRUE in the above table 
 #u_dups = unique(dups$Mutationinformation) #10
-sum( table(dups$Mutationinformation) ) #13
-
-rm(dups)
+sum( table(dups$Mutationinformation) )

 #***************************************************************
-#select non-duplicated SNPs and create a new df
-df = data[!duplicated(data$Mutationinformation),] #309, 10
+# select non-duplicated SNPs and create a new df
+df = data[!duplicated(data$Mutationinformation),]
 #***************************************************************
-#sanity check
+# sanity check
 u = unique(df$Mutationinformation)
 u2 = unique(data$Mutationinformation)
 table(u%in%u2)
-#TRUE 
-#309 
-#should all be 1, hence 309 1's
+
+# should all be 1
 sum(table(df$Mutationinformation) == 1)

-#sort df by Position
-#MANUAL CHECKPOINT:  
+# sort df by Position
+# MANUAL CHECKPOINT:  
 #foo <- df[order(df$Position),]
 #df <- df[order(df$Position),]

+# clear variables
 rm(u, u2, dups)

 ####################
@ -75,26 +92,28 @@ rm(u, u2, dups)
 #=======
 #STEP 1
 #========
-#make a copy of the PredictedAffinityColumn and call it Lig_outcome
-df$Lig_outcome = df$PredictedAffinityChange #335, 11
+# make a copy of the PredictedAffinityColumn and call it Lig_outcome
+df$Lig_outcome = df$PredictedAffinityChange

-#make Predicted...column numeric and outcome column categorical
+# make Predicted...column numeric and outcome column categorical
 head(df$PredictedAffinityChange)
 df$PredictedAffinityChange = gsub("log.*"
                                  , ""
                                  , df$PredictedAffinityChange)

-#sanity checks
+# sanity checks
 head(df$PredictedAffinityChange)

-#should be numeric, check and if not make it numeric
-is.numeric( df$PredictedAffinityChange )
-#change to numeric
-df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
-#should be TRUE
+# should be numeric, check and if not make it numeric
 is.numeric( df$PredictedAffinityChange )

-#change the column name to indicate units
+# change to numeric
+df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
+
+# should be TRUE
+is.numeric( df$PredictedAffinityChange )
+
+# change the column name to indicate units
 n = which(colnames(df) == "PredictedAffinityChange"); n
 colnames(df)[n] = "PredAffLog"
 colnames(df)[n]
@ -102,38 +121,44 @@ colnames(df)[n]
 #========
 #STEP 2
 #========
-#make Lig_outcome column categorical showing effect of mutation
+# make Lig_outcome column categorical showing effect of mutation
 head(df$Lig_outcome)
 df$Lig_outcome = gsub("^.*-"
                  , "",
                  df$Lig_outcome)
-#sanity checks
+# sanity checks
 head(df$Lig_outcome)
-#should be factor, check and if not change it to factor
+
+# should be factor, check and if not change it to factor
 is.factor(df$Lig_outcome) 
-#change to factor
+
+# change to factor
 df$Lig_outcome = as.factor(df$Lig_outcome)
-#should be TRUE
+
+# should be TRUE
 is.factor(df$Lig_outcome) 

 #========
 #STEP 3
 #========
-#gsub
+# gsub
 head(df$Distancetoligand)
 df$Distancetoligand = gsub("&Aring;"
                           , ""
                           , df$Distancetoligand)
-#sanity checks
+# sanity checks
 head(df$Distancetoligand)
-#should be numeric, check if not change it to numeric
-is.numeric(df$Distancetoligand)
-#change to numeric
-df$Distancetoligand = as.numeric(df$Distancetoligand)
-#should be TRUE
+
+# should be numeric, check if not change it to numeric
 is.numeric(df$Distancetoligand)

-#change the column name to indicate units
+# change to numeric
+df$Distancetoligand = as.numeric(df$Distancetoligand)
+
+# should be TRUE
+is.numeric(df$Distancetoligand)
+
+# change the column name to indicate units
 n = which(colnames(df) == "Distancetoligand")
 colnames(df)[n] <- "Dis_lig_Ang"
 colnames(df)[n]
@ -146,16 +171,19 @@ head(df$DUETstabilitychange)
 df$DUETstabilitychange = gsub("Kcal/mol"
                              , ""
                              , df$DUETstabilitychange)
-#sanity checks
+# sanity checks
 head(df$DUETstabilitychange)
-#should be numeric, check if not change it to numeric
-is.numeric(df$DUETstabilitychange)
-#change to numeric 
-df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
-#should be TRUE
+
+# should be numeric, check if not change it to numeric
 is.numeric(df$DUETstabilitychange)

-#change the column name to indicate units
+# change to numeric 
+df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
+
+# should be TRUE
+is.numeric(df$DUETstabilitychange)
+
+# change the column name to indicate units
 n = which(colnames(df) == "DUETstabilitychange"); n
 colnames(df)[n] = "DUETStability_Kcalpermol"
 colnames(df)[n]
@ -163,25 +191,20 @@ colnames(df)[n]
 #========
 #STEP 5
 #========
-#create yet another extra column: classification of DUET stability only
+# create yet another extra column: classification of DUET stability only
 df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
                         , "Stabilizing"
-                         , "Destabilizing")  #335, 12
+                         , "Destabilizing") # spelling to be consistent with mcsm

 table(df$Lig_outcome)
-#Destabilizing   Stabilizing 
-#281             54 

 table(df$DUET_outcome)
-#Destabilizing   Stabilizing 
-#288             47 
+
 #==============================
 #FIXME
 #Insert a venn diagram
-
 #================================

-
 #========
 #STEP 6
 #========
@ -198,10 +221,10 @@ colnames(df[mut])
 #========
 #STEP 7
 #========
-#create an extra column: maybe useful for some plots
-df$WildPos = paste0(df$Wild_type, df$Position) #335, 13
+# create an extra column: maybe useful for some plots
+df$WildPos = paste0(df$Wild_type, df$Position)

-#clear variables
+# clear variables
 rm(n, wt, mut)

 ################ end of data cleaning
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R
@ -1,25 +1,47 @@
-getwd()
-#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
-setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
-#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
+##################
+# load libraries
+ library(compare)
+##################
+
 getwd()

 #=======================================================
-#TASK:read cleaned data and perform rescaling
+# TASK:read cleaned data and perform rescaling
  # of DUET stability scores
  # of Pred affinity
-#compare scaling methods with plots
-#output normalised file
+# compare scaling methods with plots
+
+# Requirements:
+# input: R script, step3c_results_cleaning.R
+  # path: "LSHTM_analysis/mcsm_analysis/<drug>/scripts/mcsm/<filename>"
+# output: normalised .csv file
+  # path: "Data/<drug>/input/processed/<filename>"
+# (the rescaled ratio columns are added before writing)
 #=======================================================

+# specify variables for input and output paths and filenames
+homedir = "~"
+currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm"
+in_filename = "/step3c_results_cleaning.R"
+infile = paste0(homedir, currdir, in_filename)
+print(paste0("Input file is:", infile))
+
+# output file
+basedir = "/git/Data/pyrazinamide/input"
+outpath = "/processed"
+out_filename = "/mcsm_complex1_normalised.csv"
+outfile = paste0(homedir, basedir, outpath, out_filename)
+print(paste0("Output file is:", outfile))
+
 ####################
 #### read file #####: this will be the output of my R script that cleans the data columns
 ####################
-source("../Scripts/step3c_data_cleaning.R")
-##This will outut two dataframes:
-##data: unclean data: 335, 10
-##df : cleaned df 335, 13
-## you can remove data if you want as you will not need it
+source(infile)
+
+# This will output two dataframes:
+# data: unclean data: 10 cols
+# df : cleaned df: 13 cols
+# you can remove data if you want as you will not need it
 rm(data)

 colnames(df)
@ -36,67 +58,60 @@ group = which(colnames(df) == "Lig_outcome"); group
 # This is because this makes it easier to see the results of rescaling for debugging
 head(df$PredAffLog)

-#ORDER BY PredAff scrores: negative  values at the top and positive at the bottoom
+# ORDER BY PredAff scores: negative values at the top and positive at the bottom
 df = df[order(df$PredAffLog),] 
 head(df$PredAffLog)

-#sanity checks
-head(df[,n]) #all negatives
-tail(df[,n]) #all positives
+# sanity checks
+head(df[,n]) # all negatives
+tail(df[,n]) # all positives

-#sanity checks
+# sanity checks
 mean(df[,n])
 #-0.9526746

 tapply(df[,n], df[,group], mean)
-#Destabilizing   Stabilizing 
-#-1.2112100      0.3926667 
+
 #===========================
-#Same as above: in 2 steps
+# Same as above: in 2 steps
 #===========================

-#find range of your data
-my_min = min(df[,n]); my_min #-3.948
-my_max = max(df[,n]); my_max #2.23
+# find range of your data
+my_min = min(df[,n]); my_min #
+my_max = max(df[,n]); my_max #

 #===============================================
 # WITHIN GROUP rescaling 2: method "ratio"
 # create column to store the rescaled values
 # Rescaling separately (Less dangerous) 
-#       =====> chosen one:as Nick prefers
+#       =====> chosen one: preserves sign
 #===============================================
 df$ratioPredAff = ifelse(df[,n] < 0
                      , df[,n]/abs(my_min)
                      , df[,n]/my_max
-                      )#335 14
-#sanity checks
+                      )# 14 cols
+# sanity checks
 head(df$ratioPredAff)
 tail(df$ratioPredAff)

 min(df$ratioPredAff); max(df$ratioPredAff)

 tapply(df$ratioPredAff, df$Lig_outcome, min)
-#Destabilizing   Stabilizing 
-#-1.000000000   0.005381166 

 tapply(df$ratioPredAff, df$Lig_outcome, max)
-#Destabilizing   Stabilizing 
-#-0.001266464   1.000000000

-#should be the same as below (281 and 54)
+# should be the same as below 
 sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)

 table(df$Lig_outcome)
-#Destabilizing   Stabilizing 
-#281              54

 #===============================================
 # Hist and density plots to compare the rescaling 
 # methods: Base R
 #===============================================
-#uncomment as necessary
+# uncomment as necessary
 my_title = "Ligand_stability"
-#my_title = colnames(df[n])
+# my_title = colnames(df[n])

 # Set the margin on all sides
 par(oma = c(3,2,3,0)
@ -140,7 +155,7 @@ rm(my_min, my_max, my_title, n, group)
 #===================
 # 3b: DUET stability
 #===================
-dim(df) #335, 14
+dim(df) # 14 cols

 n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
 group = which(colnames(df) == "DUET_outcome"); group #12
@ -151,63 +166,53 @@ group = which(colnames(df) == "DUET_outcome"); group #12
 # This is because this makes it easier to see the results of rescaling for debugging
 head(df$DUETStability_Kcalpermol)

-#ORDER BY DUET scores: negative values at the top and positive at the bottom
+# ORDER BY DUET scores: negative values at the top and positive at the bottom
 df = df[order(df$DUETStability_Kcalpermol),] 

-#sanity checks
-head(df[,n]) #negatives
-tail(df[,n]) #positives
+# sanity checks
+head(df[,n]) # negatives
+tail(df[,n]) # positives

-#sanity checks
+# sanity checks
 mean(df[,n])
-#[1] -1.173316

 tapply(df[,n], df[,group], mean)
-#Destabilizing   Stabilizing 
-#-1.4297257     0.3978723

 #===============================================
 # WITHIN GROUP rescaling 2: method "ratio"
 # create column to store the rescaled values
 # Rescaling separately (Less dangerous) 
-#       =====> chosen one:as Nick prefers
+#       =====> chosen one: preserves sign
 #===============================================
-#find range of your data
-my_min = min(df[,n]); my_min #-3.87
-my_max = max(df[,n]); my_max #1.689
+# find range of your data
+my_min = min(df[,n]); my_min 
+my_max = max(df[,n]); my_max

 df$ratioDUET = ifelse(df[,n] < 0
                      , df[,n]/abs(my_min)
                      , df[,n]/my_max
-                    ) #335, 15
-#sanity check
+                    ) # 15 cols
+# sanity check
 head(df$ratioDUET)
 tail(df$ratioDUET)

 min(df$ratioDUET); max(df$ratioDUET)

-#sanity checks
+# sanity checks
 tapply(df$ratioDUET, df$DUET_outcome, min)
-#Destabilizing   Stabilizing 
-#-1.00000000    0.01065719

 tapply(df$ratioDUET, df$DUET_outcome, max)
-#Destabilizing   Stabilizing 
-#-0.003875969   1.000000000 

-#should be the same as below (267 and 42)
+# should be the same as below
 sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)

 table(df$DUET_outcome)
-#Destabilizing   Stabilizing 
-#288             47

 #===============================================
 # Hist and density plots to compare the rescaling 
 # methods: Base R
 #===============================================
-#uncomment as necessary
-
+# uncomment as necessary
 my_title = "DUET_stability"
 #my_title = colnames(df[n])

@ -246,7 +251,25 @@ mtext(text = my_title
      , line = 0
      , outer = TRUE)

+# reorder by column name
+#data <- data[c("A", "B", "C")]
+colnames(df)
+df2 = df[c("X", "Mutationinformation",  "WildPos", "Position"
+           , "Wild_type", "Mutant_type"
+           , "DUETStability_Kcalpermol", "DUET_outcome"
+           , "Dis_lig_Ang", "PredAffLog", "Lig_outcome"
+           , "ratioDUET", "ratioPredAff"
+           , "LigandID","Chain")]
+
+# sanity check
+# should be True
+#compare(df, df2, allowAll = T)
+compare(df, df2, ignoreColOrder = T)
+#TRUE 
+#reordered columns
+
 #===================
 # write output as csv file
 #===================
-write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15
+#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE)
+write.csv(df2, outfile, row.names = FALSE)