renamed files with shorter names and corrected paths for input and output files

2020-01-13 10:39:49 +00:00 · 2020-01-13 10:39:49 +00:00 · 61fcd14b17
commit 61fcd14b17
parent 4be0de97d7
8 changed files with 244 additions and 164 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/run.sh
@ -1,6 +1,9 @@
-# run step0-step3a for mcsm pipeline
+#!/bin/bash
 # run all bash scripts for mcsm
 #./step0_check_duplicate_SNPs.sh
-#./step1_mCSMLig_curl_submit_store_outputurl.sh
+#./step1_lig_output_urls.sh
-./step2_mCSM_LIG_batch_outputurls_results.sh
+./step2_lig_results.sh
-./step3a_mCSM_LIG_regex_output_formatting.sh
+./step3a_results_format_interim.sh
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step1_mCSMLig_curl_submit_store_outputurl.sh
@ -24,7 +24,7 @@ infile_mut="/pnca_mis_SNPs_v2_unique.csv"
 infile_struc="/complex1_no_water.pdb"
 outpath="${inpath}${processed_path}"
-outfile="/mCSM_lig_complex1_result_url.txt"
+outfile="/complex1_result_url.txt"
 # create valid input and output filenames
 #filename="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
@ -82,7 +82,7 @@ echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${filename})..."
 # create output file with the added number of muts from file
 # after much thought, bad idea as less generic!
-#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_mCSM_lig_complex1_result_url.txt
+#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
 echo -e "${host}${result_url}" >> ${outfilename}
 #echo -n '.'
 done < "${filename}"
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step2_mCSM_LIG_batch_outputurls_results.sh
@ -35,10 +35,10 @@
 # specify variables for input and output paths and filenames
 inpath="${HOME}/git/Data/pyrazinamide/input"
 processed_path="/processed"
-infile="/mCSM_lig_complex1_result_url.txt"
+infile="/complex1_result_url.txt"
 outpath="${inpath}${processed_path}"
-outfile="/mCSM_lig_complex1_output_MASTER.txt"
+outfile="/complex1_output_MASTER.txt"
 # create valid input and output filenames
 filename="${inpath}${processed_path}${infile}"
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3a_mCSM_LIG_regex_output_formatting.sh
@ -27,12 +27,11 @@ inpath="${HOME}/git/Data/pyrazinamide/input"
 processed_path="/processed"
 # Create input file: copy and rename output file of step2
-oldfile="${inpath}${processed_path}/mCSM_lig_complex1_output_MASTER.txt"
+oldfile="${inpath}${processed_path}/complex1_output_MASTER.txt"
-newfile="${inpath}${processed_path}/mCSM_lig_complex1_output_processed.txt"
+newfile="${inpath}${processed_path}/complex1_output_processed.txt"
 cp $oldfile $newfile
-#infile="../Results/336_mCSM_lig_complex1_output_processed.txt"
+infile="/complex1_output_processed.txt"
 infile="/mCSM_lig_complex1_output_processed.txt"
 filename="${inpath}${processed_path}${infile}"
 echo Input filename is : ${filename}
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_format_results.py
@ -1,29 +0,0 @@
 #!/usr/bin/python
 import pandas as pd
 from collections import defaultdict
 #file = r'../Results/322_mCSM_lig_complex1_output_processed.txt'
 outCols=[
        'PredictedAffinityChange',
        'Mutationinformation',
        'Wild-type',
        'Position',
        'Mutant-type',
        'Chain',
        'LigandID',
        'Distancetoligand',
        'DUETstabilitychange'
        ]
 lines = [line.rstrip('\n') for line in open('../Results/336_mCSM_lig_complex1_output_processed.txt')]
 outputs = defaultdict(list)
 for item in lines:
 	col, val = item.split(':')
 	outputs[col].append(val)
 dfOut=pd.DataFrame(outputs)
 pd.DataFrame.to_csv(dfOut,'../Results/336_complex1_formatted_results.csv', columns=outCols)
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3b_results_format_df.py
@ -0,0 +1,61 @@
 #!/usr/bin/python
 ###################
 # load libraries
 import os
 import sys
 import pandas as pd
 from collections import defaultdict
 ####################
 #********************************************************************
 # TASK: Formatting results with nice colnames
 # step3a processed the mcsm results to remove all newlines and
 # brought data in a format where the delimiter ":" splits
 # data into a convenient format of "colname": "value".
 # this script formats the data and outputs a df with each row
 # as a mutation and its corresponding mcsm_values
 # Requirements:
 # input: output of step3a, file containing  "..._output_processed.txt"
     # path: "Data/<drug>/input/processed/<filename>"
 # output: formatted .csv file
     # path: "Data/<drug>/input/processed/<filename>"
 #***********************************************************************
 # specify variables for input and output paths and filenames
 homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
 basedir = "/git/Data/pyrazinamide/input"
 inpath = "/processed"
 in_filename = "/complex1_output_processed.txt"
 infile = homedir + basedir + inpath + in_filename
 print("Input file is:", infile)
 outpath = "/processed"
 out_filename = "/complex1_formatted_results.csv"
 outfile = homedir + basedir + outpath + out_filename
 print("Output file is:", outfile)
 # end of variable assignment for input and output files
 # columns written to the output csv, in this order
 outCols=[
        'PredictedAffinityChange',
        'Mutationinformation',
        'Wild-type',
        'Position',
        'Mutant-type',
        'Chain',
        'LigandID',
        'Distancetoligand',
        'DUETstabilitychange'
        ]
 # group the values by column name: every input line is "colname:value"
 outputs = defaultdict(list)
 # 'with' guarantees the input file is closed, even if a line fails to parse
 with open(infile) as fh:
     for item in fh:
         item = item.rstrip('\n')
         # skip empty lines (e.g. a trailing newline at the end of the file)
         if not item.strip():
             continue
         # split on the FIRST ":" only, so a value that itself contains ":"
         # is kept whole; a line with no ":" at all still raises ValueError
         col, val = item.split(':', 1)
         outputs[col].append(val)
 # one row per mutation, one column per collected colname
 dfOut = pd.DataFrame(outputs)
 dfOut.to_csv(outfile, columns=outCols)
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step3c_results_cleaning.R
@ -1,22 +1,42 @@
 getwd()
-#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
+#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results")
 setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
 #setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
 getwd()
 #=======================================================
 # TASK: read formatted_results_df.csv to complete 
 # missing info, adding DUET categories, assigning
 # meaningful colnames, etc.
 # Requirements:
 # input: output of step3b, python processing,
  # path: Data/<drug>/input/processed/<filename>"
 # output: NO output, as the next script refers to this one
 # for yet more processing
 #=======================================================
 # specify variables for input and output paths and filenames
 homedir = "~"
 basedir = "/git/Data/pyrazinamide/input"
 inpath = "/processed"
 in_filename = "/complex1_formatted_results.csv"
 infile = paste0(homedir, basedir, inpath, in_filename)
 print(paste0("Input file is:", infile))
 #======================================================
 #TASK: To tidy the columns so you can generate figures
 #=======================================================
 ####################
 #### read file #####: this will be the output from python script (csv file)
 ####################
-data = read.csv("336_complex1_formatted_results.csv"
+data = read.csv(infile
              , header = T
              , stringsAsFactors = FALSE)
 dim(data)
 #335, 10
 str(data)
 # clear variables
 rm(homedir, basedir, inpath, in_filename, infile)
 ###########################
 ##### Data processing #####
 ###########################
@ -31,41 +51,38 @@ data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.typ
 head(data$Mutationinformation)
 tail(data$Mutationinformation)
 #write.csv(data, 'test.csv')
 ##########################################
 # Remove duplicate SNPs as a sanity check
 ##########################################
-#very important
+# very important
 table(duplicated(data$Mutationinformation))
 #FALSE   
 #335
-#extract duplicated entries
+# extract duplicated entries
 dups = data[duplicated(data$Mutationinformation),] #0
-#No of dups should match with the no. of TRUE in the above table 
+# No of dups should match with the no. of TRUE in the above table 
 #u_dups = unique(dups$Mutationinformation) #10
-sum( table(dups$Mutationinformation) ) #13
+sum( table(dups$Mutationinformation) )
 rm(dups)
 #***************************************************************
-#select non-duplicated SNPs and create a new df
+# select non-duplicated SNPs and create a new df
-df = data[!duplicated(data$Mutationinformation),] #309, 10
+df = data[!duplicated(data$Mutationinformation),]
 #***************************************************************
-#sanity check
+# sanity check
 u = unique(df$Mutationinformation)
 u2 = unique(data$Mutationinformation)
 table(u%in%u2)
-#TRUE 
+
-#309 
+# should all be 1
 #should all be 1, hence 309 1's
 sum(table(df$Mutationinformation) == 1)
-#sort df by Position
+# sort df by Position
-#MANUAL CHECKPOINT:  
+# MANUAL CHECKPOINT:  
 #foo <- df[order(df$Position),]
 #df <- df[order(df$Position),]
 # clear variables
 rm(u, u2, dups)
 ####################
@ -75,26 +92,28 @@ rm(u, u2, dups)
 #=======
 #STEP 1
 #========
-#make a copy of the PredictedAffinityColumn and call it Lig_outcome
+# make a copy of the PredictedAffinityColumn and call it Lig_outcome
-df$Lig_outcome = df$PredictedAffinityChange #335, 11
+df$Lig_outcome = df$PredictedAffinityChange
-#make Predicted...column numeric and outcome column categorical
+ #make Predicted...column numeric and outcome column categorical
 head(df$PredictedAffinityChange)
 df$PredictedAffinityChange = gsub("log.*"
                                  , ""
                                  , df$PredictedAffinityChange)
-#sanity checks
+# sanity checks
 head(df$PredictedAffinityChange)
-#should be numeric, check and if not make it numeric
+# should be numeric, check and if not make it numeric
 is.numeric( df$PredictedAffinityChange )
 #change to numeric
 df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
 #should be TRUE
 is.numeric( df$PredictedAffinityChange )
-#change the column name to indicate units
+# change to numeric
 df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
 # should be TRUE
 is.numeric( df$PredictedAffinityChange )
 # change the column name to indicate units
 n = which(colnames(df) == "PredictedAffinityChange"); n
 colnames(df)[n] = "PredAffLog"
 colnames(df)[n]
@ -102,38 +121,44 @@ colnames(df)[n]
 #========
 #STEP 2
 #========
-#make Lig_outcome column categorical showing effect of mutation
+# make Lig_outcome column categorical showing effect of mutation
 head(df$Lig_outcome)
 df$Lig_outcome = gsub("^.*-"
                  , "",
                  df$Lig_outcome)
-#sanity checks
+# sanity checks
 head(df$Lig_outcome)
-#should be factor, check and if not change it to factor
+
 # should be factor, check and if not change it to factor
 is.factor(df$Lig_outcome) 
-#change to factor
+
 # change to factor
 df$Lig_outcome = as.factor(df$Lig_outcome)
-#should be TRUE
+
 # should be TRUE
 is.factor(df$Lig_outcome) 
 #========
 #STEP 3
 #========
-#gsub
+# gsub
 head(df$Distancetoligand)
 df$Distancetoligand = gsub("&Aring;"
                           , ""
                           , df$Distancetoligand)
-#sanity checks
+# sanity checks
 head(df$Distancetoligand)
-#should be numeric, check if not change it to numeric
+
-is.numeric(df$Distancetoligand)
+# should be numeric, check if not change it to numeric
 #change to numeric
 df$Distancetoligand = as.numeric(df$Distancetoligand)
 #should be TRUE
 is.numeric(df$Distancetoligand)
-#change the column name to indicate units
+# change to numeric
 df$Distancetoligand = as.numeric(df$Distancetoligand)
 # should be TRUE
 is.numeric(df$Distancetoligand)
 # change the column name to indicate units
 n = which(colnames(df) == "Distancetoligand")
 colnames(df)[n] <- "Dis_lig_Ang"
 colnames(df)[n]
@ -146,16 +171,19 @@ head(df$DUETstabilitychange)
 df$DUETstabilitychange = gsub("Kcal/mol"
                              , ""
                              , df$DUETstabilitychange)
-#sanity checks
+# sanity checks
 head(df$DUETstabilitychange)
-#should be numeric, check if not change it to numeric
+
-is.numeric(df$DUETstabilitychange)
+# should be numeric, check if not change it to numeric
 #change to numeric 
 df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
 #should be TRUE
 is.numeric(df$DUETstabilitychange)
-#change the column name to indicate units
+# change to numeric 
 df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
 # should be TRUE
 is.numeric(df$DUETstabilitychange)
 # change the column name to indicate units
 n = which(colnames(df) == "DUETstabilitychange"); n
 colnames(df)[n] = "DUETStability_Kcalpermol"
 colnames(df)[n]
@ -163,25 +191,20 @@ colnames(df)[n]
 #========
 #STEP 5
 #========
-#create yet another extra column: classification of DUET stability only
+# create yet another extra column: classification of DUET stability only
 df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
                         , "Stabilizing"
-                         , "Destabilizing")  #335, 12
+                         , "Destabilizing") # spelling to be consistent with mcsm
 table(df$Lig_outcome)
 #Destabilizing   Stabilizing 
 #281             54 
 table(df$DUET_outcome)
-#Destabilizing   Stabilizing 
+
 #288             47 
 #==============================
 #FIXME
 #Insert a venn diagram
 #================================
 #========
 #STEP 6
 #========
@ -198,10 +221,10 @@ colnames(df[mut])
 #========
 #STEP 7
 #========
-#create an extra column: maybe useful for some plots
+# create an extra column: maybe useful for some plots
-df$WildPos = paste0(df$Wild_type, df$Position) #335, 13
+df$WildPos = paste0(df$Wild_type, df$Position)
-#clear variables
+# clear variables
 rm(n, wt, mut)
 ################ end of data cleaning
--- a/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R
+++ b/mcsm_analysis/pyrazinamide/scripts/mcsm/step4_results_normalise.R
@ -1,25 +1,47 @@
-getwd()
+##################
-#setwd("~/Documents/git/LSHTM_analysis/mcsm_complex1/Results") # work
+# load libraries
-setwd("~/git/LSHTM_analysis/mcsm_complex1/Results") # thinkpad
+ library(compare)
-#setwd("/Users/tanu/git/LSHTM_analysis/mcsm_complex1/Results") # mac
+##################
 getwd()
 #=======================================================
-#TASK:read cleaned data and perform rescaling
+# TASK:read cleaned data and perform rescaling
  # of DUET stability scores
  # of Pred affinity
-#compare scaling methods with plots
+# compare scaling methods with plots
-#output normalised file
+
 # Requirements:
 # input: R script, step3c_results_cleaning.R
  # path: Data/<drug>/input/processed/<filename>"
 # output: normalised .csv file
  # path: Data/<drug>/input/processed/<filename>
 #=======================================================
 # specify variables for input and output paths and filenames
 homedir = "~"
 currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm"
 in_filename = "/step3c_results_cleaning.R"
 infile = paste0(homedir, currdir, in_filename)
 print(paste0("Input file is:", infile))
 # output file
 basedir = "/git/Data/pyrazinamide/input"
 outpath = "/processed"
 out_filename = "/mcsm_complex1_normalised.csv"
 outfile = paste0(homedir, basedir, outpath, out_filename)
 print(paste0("Output file is:", outfile))
 ####################
 #### read file #####: this will be the output of my R script that cleans the data columns
 ####################
-source("../Scripts/step3c_data_cleaning.R")
+source(infile)
-##This will outut two dataframes:
+
-##data: unclean data: 335, 10
+#This will output two dataframes:
-##df : cleaned df 335, 13
+# data: unclean data: 10 cols
-## you can remove data if you want as you will not need it
+# df : cleaned df: 13 cols
 # you can remove data if you want as you will not need it
 rm(data)
 colnames(df)
@ -36,67 +58,60 @@ group = which(colnames(df) == "Lig_outcome"); group
 # This is because this makes it easier to see the results of rescaling for debugging
 head(df$PredAffLog)
-#ORDER BY PredAff scrores: negative  values at the top and positive at the bottoom
+# ORDER BY PredAff scores: negative values at the top and positive at the bottom
 df = df[order(df$PredAffLog),] 
 head(df$PredAffLog)
-#sanity checks
+# sanity checks
-head(df[,n]) #all negatives
+head(df[,n]) # all negatives
-tail(df[,n]) #all positives
+tail(df[,n]) # all positives
-#sanity checks
+# sanity checks
 mean(df[,n])
 #-0.9526746
 tapply(df[,n], df[,group], mean)
-#Destabilizing   Stabilizing 
+
 #-1.2112100      0.3926667 
 #===========================
-#Same as above: in 2 steps
+# Same as above: in 2 steps
 #===========================
-#find range of your data
+# find range of your data
-my_min = min(df[,n]); my_min #-3.948
+my_min = min(df[,n]); my_min #
-my_max = max(df[,n]); my_max #2.23
+my_max = max(df[,n]); my_max #
 #===============================================
 # WITHIN GROUP rescaling 2: method "ratio"
 # create column to store the rescaled values
 # Rescaling separately (Less dangerous) 
-#       =====> chosen one:as Nick prefers
+#       =====> chosen one: preserves sign
 #===============================================
 df$ratioPredAff = ifelse(df[,n] < 0
                      , df[,n]/abs(my_min)
                      , df[,n]/my_max
-                      )#335 14
+                      )# 14 cols
-#sanity checks
+# sanity checks
 head(df$ratioPredAff)
 tail(df$ratioPredAff)
 min(df$ratioPredAff); max(df$ratioPredAff)
 tapply(df$ratioPredAff, df$Lig_outcome, min)
 #Destabilizing   Stabilizing 
 #-1.000000000   0.005381166 
 tapply(df$ratioPredAff, df$Lig_outcome, max)
 #Destabilizing   Stabilizing 
 #-0.001266464   1.000000000
-#should be the same as below (281 and 54)
+# should be the same as below 
 sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
 table(df$Lig_outcome)
 #Destabilizing   Stabilizing 
 #281              54
 #===============================================
 # Hist and density plots to compare the rescaling 
 # methods: Base R
 #===============================================
-#uncomment as necessary
+# uncomment as necessary
 my_title = "Ligand_stability"
-#my_title = colnames(df[n])
+# my_title = colnames(df[n])
 # Set the margin on all sides
 par(oma = c(3,2,3,0)
@ -140,7 +155,7 @@ rm(my_min, my_max, my_title, n, group)
 #===================
 # 3b: DUET stability
 #===================
-dim(df) #335, 14
+dim(df) # 14 cols
 n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
 group = which(colnames(df) == "DUET_outcome"); group #12
@ -151,63 +166,53 @@ group = which(colnames(df) == "DUET_outcome"); group #12
 # This is because this makes it easier to see the results of rescaling for debugging
 head(df$DUETStability_Kcalpermol)
-#ORDER BY DUET scores: negative values at the top and positive at the bottom
+# ORDER BY DUET scores: negative values at the top and positive at the bottom
 df = df[order(df$DUETStability_Kcalpermol),] 
-#sanity checks
+# sanity checks
-head(df[,n]) #negatives
+head(df[,n]) # negatives
-tail(df[,n]) #positives
+tail(df[,n]) # positives
-#sanity checks
+# sanity checks
 mean(df[,n])
 #[1] -1.173316
 tapply(df[,n], df[,group], mean)
 #Destabilizing   Stabilizing 
 #-1.4297257     0.3978723
 #===============================================
 # WITHIN GROUP rescaling 2: method "ratio"
 # create column to store the rescaled values
 # Rescaling separately (Less dangerous) 
-#       =====> chosen one:as Nick prefers
+#       =====> chosen one: preserves sign
 #===============================================
-#find range of your data
+# find range of your data
-my_min = min(df[,n]); my_min #-3.87
+my_min = min(df[,n]); my_min 
-my_max = max(df[,n]); my_max #1.689
+my_max = max(df[,n]); my_max
 df$ratioDUET = ifelse(df[,n] < 0
                      , df[,n]/abs(my_min)
                      , df[,n]/my_max
-                    ) #335, 15
+                    ) # 15 cols
-#sanity check
+# sanity check
 head(df$ratioDUET)
 tail(df$ratioDUET)
 min(df$ratioDUET); max(df$ratioDUET)
-#sanity checks
+# sanity checks
 tapply(df$ratioDUET, df$DUET_outcome, min)
 #Destabilizing   Stabilizing 
 #-1.00000000    0.01065719
 tapply(df$ratioDUET, df$DUET_outcome, max)
 #Destabilizing   Stabilizing 
 #-0.003875969   1.000000000 
-#should be the same as below (267 and 42)
+# should be the same as below (267 and 42)
 sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
 table(df$DUET_outcome)
 #Destabilizing   Stabilizing 
 #288             47
 #===============================================
 # Hist and density plots to compare the rescaling 
 # methods: Base R
 #===============================================
-#uncomment as necessary
+# uncomment as necessary
 my_title = "DUET_stability"
 #my_title = colnames(df[n])
@ -246,7 +251,25 @@ mtext(text = my_title
      , line = 0
      , outer = TRUE)
 # reorder by column name
 #data <- data[c("A", "B", "C")]
 colnames(df)
 df2 = df[c("X", "Mutationinformation",  "WildPos", "Position"
           , "Wild_type", "Mutant_type"
           , "DUETStability_Kcalpermol", "DUET_outcome"
           , "Dis_lig_Ang", "PredAffLog", "Lig_outcome"
           , "ratioDUET", "ratioPredAff"
           , "LigandID","Chain")]
 # sanity check
 # should be True
 #compare(df, df2, allowAll = T)
 compare(df, df2, ignoreColOrder = T)
 #TRUE 
 #reordered columns
 #===================
 # write output as csv file
 #===================
-write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE) #335, 15
+#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE)
 write.csv(df2, outfile, row.names = FALSE)