From 2c013124ad9b59e1d4a43c5b9facfe5d5c2fcbee Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 21 Sep 2020 17:54:54 +0100 Subject: [PATCH] updated gitignore to tidyup --- .gitignore | 1 + meta_data_analysis/AF_and_OR_calcs.R | 398 --------------- meta_data_analysis/combine_dfs.py | 248 ---------- meta_data_analysis/combining_df_lig.R | 397 --------------- meta_data_analysis/combining_df_ps.R | 461 ------------------ meta_data_analysis/dssp_df.py | 170 ------- meta_data_analysis/init_data_dirs.py | 36 -- meta_data_analysis/kd_df.py | 193 -------- .../mut_electrostatic_changes.py | 167 ------- meta_data_analysis/mutate.py | 179 ------- meta_data_analysis/rd_df.py | 135 ----- meta_data_analysis/run_mutate.sh | 18 - meta_data_analysis/run_pdb_dssp.sh | 54 -- plotting_test/logo_plot.R | 221 --------- 14 files changed, 1 insertion(+), 2677 deletions(-) delete mode 100644 meta_data_analysis/AF_and_OR_calcs.R delete mode 100755 meta_data_analysis/combine_dfs.py delete mode 100644 meta_data_analysis/combining_df_lig.R delete mode 100644 meta_data_analysis/combining_df_ps.R delete mode 100755 meta_data_analysis/dssp_df.py delete mode 100755 meta_data_analysis/init_data_dirs.py delete mode 100644 meta_data_analysis/kd_df.py delete mode 100755 meta_data_analysis/mut_electrostatic_changes.py delete mode 100644 meta_data_analysis/mutate.py delete mode 100755 meta_data_analysis/rd_df.py delete mode 100755 meta_data_analysis/run_mutate.sh delete mode 100755 meta_data_analysis/run_pdb_dssp.sh delete mode 100644 plotting_test/logo_plot.R diff --git a/.gitignore b/.gitignore index 2f8afad..c1ac04d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__ */__pycache__ mcsm_analysis_fixme +meta_data_analysis del examples example diff --git a/meta_data_analysis/AF_and_OR_calcs.R b/meta_data_analysis/AF_and_OR_calcs.R deleted file mode 100644 index f5c9db6..0000000 --- a/meta_data_analysis/AF_and_OR_calcs.R +++ /dev/null @@ -1,398 +0,0 @@ -######################################################### -# TASK: To calculate Allele Frequency and -# Odds Ratio from master data -# and add the calculated params to meta_data extracted from -# data_extraction.py -######################################################### -getwd() -setwd('~/git/LSHTM_analysis/meta_data_analysis') -getwd() - -#%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = paste0(gene,'_p.') -cat(gene_match) - -#=========== -# input -#=========== -# infile1: Raw data -#indir = 'git/Data/pyrazinamide/input/original' -indir = paste0('~/git/Data') -in_filename = 'original_tanushree_data_v2.csv' -infile = paste0(indir, '/', in_filename) -cat(paste0('Reading infile1: raw data', ' ', infile) ) - -# infile2: gene associated meta data file to extract valid snps and add calcs to. -# This is outfile3 from data_extraction.py -indir_metadata = paste0('~/git/Data', '/', drug, '/', 'output') -in_filename_metadata = 'pnca_metadata.csv' -infile_metadata = paste0(indir_metadata, '/', in_filename_metadata) -cat(paste0('Reading infile2: gene associated metadata:', infile_metadata)) - -#=========== -# output -#=========== -# outdir = 'git/Data/pyrazinamide/output' -outdir = paste0('~/git/Data', '/', drug, '/', 'output') -out_filename = paste0(tolower(gene),'_', 'meta_data_with_AFandOR.csv') -outfile = paste0(outdir, '/', out_filename) -cat(paste0('Output file with full path:', outfile)) -#%% end of variable assignment for input and output files - -######################################################### -# 1: Read master/raw data stored in Data/ -######################################################### -raw_data_all = read.csv(infile, stringsAsFactors = F) - -raw_data = raw_data_all[,c("id" - , "pyrazinamide" - , "dr_mutations_pyrazinamide" - , "other_mutations_pyrazinamide")] -rm(raw_data_all) - -rm(indir, in_filename, infile) - -#=========== -# 1a: exclude na -#=========== -raw_data = raw_data[!is.na(raw_data$pyrazinamide),] - -total_samples = length(unique(raw_data$id)) -cat(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples)) - -# sanity check: should be true -is.numeric(total_samples) - -#=========== -# 1b: combine the two mutation columns -#=========== -raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide - , raw_data$other_mutations_pyrazinamide) -head(raw_data$all_mutations_pyrazinamide) - -#=========== -# 1c: create yet another column that contains all the mutations but in lower case -#=========== -raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide) - -# sanity checks -#table(grepl("pnca_p",raw_data$all_muts_pnca)) -cat(paste0('converting gene match:', gene_match, ' ', 'to lowercase')) -gene_match = tolower(gene_match) - -table(grepl(gene_match,raw_data$all_muts_pnca)) - -# sanity check: should be TRUE -#sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples -sum(table(grepl(gene_match,raw_data$all_muts_pnca))) == total_samples - -# set up variables: can be used for logistic regression as well -i = "pnca_p.ala134gly" # has a NA, should NOT exist -table(grepl(i,raw_data$all_muts_pnca)) - -i = "pnca_p.trp68gly" -table(grepl(i,raw_data$all_muts_pnca)) - -mut = grepl(i,raw_data$all_muts_pnca) -dst = raw_data$pyrazinamide -table(mut, dst) - -#chisq.test(table(mut,dst)) -#fisher.test(table(mut, dst)) -#table(mut) - -######################################################### -# 2: Read valid snps for which OR -# can be calculated (infile_comp_snps.csv) -######################################################### -cat(paste0('Reading metadata infile:', infile_metadata)) - -pnca_metadata = read.csv(infile_metadata -# , file.choose() - , stringsAsFactors = F - , header = T) - - -# clear variables -rm(indir, in_filename, infile) -rm(indir_metadata, in_filename_metadata, infile_metadata) - -# count na in pyrazinamide column -tot_pza_na = sum(is.na(pnca_metadata$pyrazinamide)) -expected_rows = nrow(pnca_metadata) - tot_pza_na - -# drop na from the pyrazinamide colum -pnca_snps_or = pnca_metadata[!is.na(pnca_metadata$pyrazinamide),] - -# sanity check -if(nrow(pnca_snps_or) == expected_rows){ - cat('PASS: no. of rows match with expected_rows') -} else{ - cat('FAIL: nrows mismatch.') -} - -# extract unique snps to iterate over for AF and OR calcs -pnca_snps_unique = unique(pnca_snps_or$mutation) - -cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(pnca_snps_unique))) - -# Define OR function -x = as.numeric(mut) -y = dst -or = function(x,y){ - tab = as.matrix(table(x,y)) - a = tab[2,2] - if (a==0){ a<-0.5} - b = tab[2,1] - if (b==0){ b<-0.5} - c = tab[1,2] - if (c==0){ c<-0.5} - d = tab[1,1] - if (d==0){ d<-0.5} - (a/b)/(c/d) - - } - -dst = raw_data$pyrazinamide -ors = sapply(pnca_snps_unique,function(m){ - mut = grepl(m,raw_data$all_muts_pnca) - or(mut,dst) -}) - -ors - -pvals = sapply(pnca_snps_unique,function(m){ - mut = grepl(m,raw_data$all_muts_pnca) - fisher.test(mut,dst)$p.value -}) - -pvals - -afs = sapply(pnca_snps_unique,function(m){ - mut = grepl(m,raw_data$all_muts_pnca) - mean(mut) -}) - -afs - -# check ..hmmm -afs['pnca_p.trp68gly'] -afs['pnca_p.gln10pro'] -afs['pnca_p.leu4ser'] - -plot(density(log(ors))) -plot(-log10(pvals)) -hist(log(ors) - , breaks = 100 - ) - -# sanity check -if (table(names(ors) == names(pvals)) & table(names(ors) == names(afs)) & table(names(pvals) == names(afs)) == length(pnca_snps_unique)){ -cat('PASS: names of ors, pvals and afs match: proceed with combining into a single df') -} else{ - cat('FAIL: names of ors, pvals and afs mismatch') -} - -# combine ors, pvals and afs -cat('Combining calculated params into a df: ors, pvals and afs') - -comb_AF_and_OR = data.frame(ors, pvals, afs) -cat('No. of rows in comb_AF_and_OR: ', nrow(comb_AF_and_OR) - , '\nNo. of cols in comb_AF_and_OR: ', ncol(comb_AF_and_OR)) - -cat('Rownames == mutation: ', head(rownames(comb_AF_and_OR))) - -# add rownames of comb_AF_and_OR as an extra column 'mutation' to allow merging based on this column -comb_AF_and_OR$mutation = rownames(comb_AF_and_OR) - -# sanity check -if (table(rownames(comb_AF_and_OR) == comb_AF_and_OR$mutation)){ - cat('PASS: rownames and mutaion col values match') -}else{ - cat('FAIL: rownames and mutation col values mismatch') -} - -######################################################### -# 3: Merge meta data file + calculated num params -######################################################### -df1 = pnca_metadata -df2 = comb_AF_and_OR - -cat('checking commom col of the two dfs before merging:' - ,'\ndf1:', head(df1$mutation) - , '\ndf2:', head(df2$mutation)) - -cat(paste0('merging two dfs: ' - ,'\ndf1 (big df i.e. meta data) nrows: ', nrow(df1) - ,'\ndf2 (small df i.e af, or, pval) nrows: ', nrow(df2) - ,'\nexpected rows in merged df: ', nrow(df1) - ,'\nexpected cols in merged_df: ', (ncol(df1) + ncol(df2) - 1))) - -merged_df = merge(df1 # big file - , df2 # small (afor file) - , by = "mutation" - , all.x = T) # because you want all the entries of the meta data - -# sanity check -if(ncol(merged_df) == (ncol(df1) + ncol(df2) - 1)){ - cat(paste0('PASS: no. of cols is as expected: ', ncol(merged_df))) -} else{ - cat('FAIL: no.of cols mistmatch') -} - -# quick check -i = "pnca_p.ala134gly" # has all NAs in pyrazinamide, should be NA in ors, etc. -merged_df[merged_df$mutation == i,] - -# count na in each column -na_count = sapply(merged_df, function(y) sum(length(which(is.na(y))))); na_count - -# check last three cols: should be NA -if ( identical(na_count[[length(na_count)]], na_count[[length(na_count)-1]], na_count[[length(na_count)-2]])){ - cat('PASS: No. of NAs for OR, AF and Pvals are equal as expected', - '\nNo. of NA: ', na_count[[length(na_count)]]) -} else { - cat('FAIL: No. of NAs for OR, AF and Pvals mismatch') -} - -# reassign custom colnames -cat('Assigning custom colnames for the calculated params...') -colnames(merged_df)[colnames(merged_df)== "ors"] <- "OR" -colnames(merged_df)[colnames(merged_df)== "pvals"] <- "pvalue" -colnames(merged_df)[colnames(merged_df)== "afs"] <- "AF" - -colnames(merged_df) - -# add 3 more cols: log OR, neglog pvalue and AF_percent cols -merged_df$logor = log(merged_df$OR) -is.numeric(merged_df$logor) - -merged_df$neglog10pvalue = -log10(merged_df$pvalue) -is.numeric(merged_df$neglog10pvalue) - -merged_df$AF_percent = merged_df$AF*100 -is.numeric(merged_df$AF_percent) - -# check AFs -#i = 'pnca_p.trp68gly' -i = 'pnca_p.gln10pro' -#i = 'pnca_p.leu4ser' -merged_df[merged_df$mutation == i,] - -# FIXME: harcoding (beware!), NOT FATAL though! -ncol_added = 3 - -cat(paste0('Added', ' ', ncol_added, ' more cols to merged_df:' - , '\ncols added: logor, neglog10pvalue and AF_percent:' - , '\nno. of cols in merged_df now: ', ncol(merged_df))) - -#%% write file out: pnca_meta_data_with_AFandOR -#********************************************* -cat(paste0('writing output file: ' - , '\nFilename: ', out_filename - , '\nPath:', outdir)) - -write.csv(merged_df, outfile - , row.names = F) - -cat(paste0('Finished writing:' - , out_filename - , '\nNo. of rows: ', nrow(merged_df) - , '\nNo. of cols: ', ncol(merged_df))) -#************************************************ -cat('======================================================================') -rm(out_filename) -cat('End of script: calculated AF, OR, pvalues and saved file') -# End of script -#%% -# sanity check: Count NA in these four cols. -# However these need to be numeric else these -# will be misleading when counting NAs (i.e retrun 0) -#is.numeric(meta_with_afor$OR) -na_var = c('AF', 'OR', 'pvalue', 'logor', 'neglog10pvalue') - -# loop through these vars and check if these are numeric. -# if not, then convert to numeric -check_all = NULL - -for (i in na_var){ - # cat(i) - check0 = is.numeric(meta_with_afor[,i]) - if (check0) { - check_all = c(check0, check_all) - cat('These are all numeric cols') - } else{ - cat('First converting to numeric') - check0 = as.numeric(meta_with_afor[,i]) - check_all = c(check0, check_all) - } -} - -# count na now that the respective cols are numeric -na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count -str(na_count) - -# extract how many NAs: -# should be all TRUE -# should be a single number since -# all the cols should have 'equal' and 'same' no. of NAs -# compare if the No of 'NA' are the same for all these cols -na_len = NULL -for (i in na_var){ - temp = na_count[[i]] - na_len = c(na_len, temp) -} - -cat('Checking how many NAs and if these are identical for the selected cols:') -my_nrows = NULL -for ( i in 1: (length(na_len)-1) ){ -# cat(compare(na_len[i]), na_len[i+1]) - c = compare(na_len[i], na_len[i+1]) - if ( c$result ) { - cat('PASS: No. of NAa in selected cols are identical') - my_nrows = na_len[i] } - else { - cat('FAIL: No. of NAa in selected cols mismatch') - } -} - -cat('No. of NAs in each of the selected cols: ', my_nrows) - -# yet more sanity checks: -cat('Check whether the ', my_nrows, 'indices are indeed the same') - -#which(is.na(meta_with_afor$OR)) - -# initialise an empty df with nrows as extracted above -na_count_df = data.frame(matrix(vector(mode = 'numeric' -# , length = length(na_var) - ) - , nrow = my_nrows -# , ncol = length(na_var) - )) - -# populate the df with the indices of the cols that are NA -for (i in na_var){ - cat(i) - na_i = which(is.na(meta_with_afor[i])) - na_count_df = cbind(na_count_df, na_i) - colnames(na_count_df)[which(na_var == i)] <- i -} - -# Now compare these indices to ensure these are the same -check2 = NULL -for ( i in 1: ( length(na_count_df)-1 ) ) { -# cat(na_count_df[i] == na_count_df[i+1]) - check_all = identical(na_count_df[[i]], na_count_df[[i+1]]) - check2 = c(check_all, check2) - if ( all(check2) ) { - cat('PASS: The indices for AF, OR, etc are all the same\n') - } else { - cat ('FAIL: Please check indices which are NA') - } -} - - - diff --git a/meta_data_analysis/combine_dfs.py b/meta_data_analysis/combine_dfs.py deleted file mode 100755 index f20215d..0000000 --- a/meta_data_analysis/combine_dfs.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -''' -Created on Tue Aug 6 12:56:03 2019 - -@author: tanu -''' -# FIXME: change filename 4 (mcsm normalised data) -# to be consistent like (pnca_complex_mcsm_norm.csv) : changed manually, but ensure this is done in the mcsm pipeline -#============================================================================= -# Task: combine 4 dfs with aa position as linking column -# This is done in 2 steps: -# merge 1: of 3 dfs (filenames in lowercase) -# _dssp.csv -# _kd.csv -# _pnca_rd.csv - -# merge 2: of 2 dfs -# pnca_complex_mcsm_norm.csv (!fix name in mcsm script) -# output df from merge1 - -# Input: 3 dfs -# _dssp.csv -# _kd.csv -# _pnca_rd.csv -# pnca_complex_mcsm_norm.csv (!fix name in mcsm script) - -# Output: .csv of all 4 dfs combined - -# useful link -# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns -#============================================================================= -#%% load packages -import sys, os -import pandas as pd -#import numpy as np -import argparse -#======================================================================= -#%% specify input and curr dir -homedir = os.path.expanduser('~') - -# set working dir -os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') -os.getcwd() -#======================================================================= -#%% command line args -arg_parser = argparse.ArgumentParser() -#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') -#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive -arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazin') -arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pn') # case sensitive -args = arg_parser.parse_args() -#======================================================================= -#%% variable assignment: input and output -#drug = 'pyrazinamide' -#gene = 'pncA' -#gene_match = gene + '_p.' - -drug = args.drug -gene = args.gene -#========== -# data dir -#========== -datadir = homedir + '/' + 'git/Data' - -#======= -# input -#======= -indir = datadir + '/' + drug + '/' + 'output' -in_filename1 = 'pnca_dssp.csv' -in_filename2 = 'pnca_kd.csv' -in_filename3 = 'pnca_rd.csv' -#in_filename4 = 'mcsm_complex1_normalised.csv' # Fix name in mcsm script -in_filename4 = 'pnca_complex_mcsm_norm.csv' # manually changed temporarily - -infile1 = indir + '/' + in_filename1 -infile2 = indir + '/' + in_filename2 -infile3 = indir + '/' + in_filename3 -infile4 = indir + '/' + in_filename4 - -print('\nInput path:', indir - , '\nInput filename1:', in_filename1 - , '\nInput filename2:', in_filename2 - , '\nInput filename3:', in_filename3 - , '\nInput filename4:', in_filename4 - , '\n===================================================================') - -#======= -# output -#======= -outdir = datadir + '/' + drug + '/' + 'output' -out_filename = gene.lower() + '_mcsm_struct_params.csv' -outfile = outdir + '/' + out_filename -print('Output filename:', out_filename - , '\nOutput path:', outdir - , '\n===================================================================') - -#%% end of variable assignment for input and output files -#======================================================================= -#%% Read input file -dssp_df = pd.read_csv(infile1, sep = ',') -kd_df = pd.read_csv(infile2, sep = ',') -rd_df = pd.read_csv(infile3, sep = ',') -mcsm_df = pd.read_csv(infile4, sep = ',') - -print('Reading input files:' - , '\ndssp file:', infile1 - , '\nNo. of rows:', len(dssp_df) - , '\nNo. of cols:', len(dssp_df.columns) - , '\nColumn names:', dssp_df.columns - , '\n===================================================================' - , '\nkd file:', infile2 - , '\nNo. of rows:', len(kd_df) - , '\nNo. of cols:', len(kd_df.columns) - , '\nColumn names:', kd_df.columns - , '\n===================================================================' - , '\nrd file:', infile3 - , '\nNo. of rows:', len(rd_df) - , '\nNo. of cols:', len(rd_df.columns) - , '\nColumn names:', rd_df.columns - , '\n===================================================================' - , '\nrd file:', infile4 - , '\nNo. of rows:', len(mcsm_df) - , '\nNo. of cols:', len(mcsm_df.columns) - , '\nColumn names:', mcsm_df.columns - , '\n===================================================================') -#%% Begin combining dfs -#=================== -# concatenating df1 (3dfs): dssp_df + kd_df+ rd_df -#=================== -print('starting first merge...\n') - -# checking no. of rows -print('Checking if no. of rows of the 3 dfs are equal:\n' - , len(dssp_df) == len(kd_df) == len(rd_df) - , '\nReason: fasta files and pdb files vary since not all pos are part of the structure' - , '\n===================================================================') - -# variables for sanity checks -expected_rows_df1 = max(len(dssp_df), len(kd_df), len(rd_df)) -# beware of harcoding! used for sanity check -ndfs = 3 -ncol_merge = 1 -offset = ndfs- ncol_merge -expected_cols_df1 = len(dssp_df.columns) + len(kd_df.columns) + len(rd_df.columns) - offset - -print('Merge 1:' - , '\ncombining 3dfs by commom col: position' - , '\nExpected nrows in combined_df:', expected_rows_df1 - , '\nExpected ncols in combined_df:', expected_cols_df1 - , '\nResetting the common col as the index' - , '\n===================================================================') - -#dssp_df.set_index('position', inplace = True) -#kd_df.set_index('position', inplace = True) -#rd_df.set_index('position', inplace =True) - -#combined_df = pd.concat([dssp_df, kd_df, rd_df], axis = 1, sort = False).reset_index() -#combined_df.rename(columns = {'index':'position'}) - -combined_df1 = pd.concat( - (my_index.set_index('position') for my_index in [dssp_df, kd_df, rd_df]) - , axis = 1, join = 'outer').reset_index() - -# sanity check -print('Checking dimensions of concatenated df1...') -if len(combined_df1) == expected_rows_df1 and len(combined_df1.columns) == expected_cols_df1: - print('PASS: combined df has expected dimensions' - , '\nNo. of rows in combined df:', len(combined_df1) - , '\nNo. of cols in combined df:', len(combined_df1.columns) - , '\n===============================================================') -else: - print('FAIL: combined df does not have expected dimensions' - , '\nNo. of rows in combined df:', len(combined_df1) - , '\nNo. of cols in combined df:', len(combined_df1.columns) - , '\n===============================================================') - -#=================== -# concatenating df2 (2dfs): mcsm_df + combined_df1 -# sort sorts the cols -#=================== -print('starting second merge...\n') - -# rename col 'Position' in mcsm_df to lowercase 'position' -# as it matches the combined_df1 colname to perfom merge -#mcsm_df.columns -#mcsm_df.rename(columns = {'Position':'position'}) # not working! -# copy 'Position' column with the correct colname -print('Firstly, copying \'Position\' col and renaming \'position\' to allow merging' - , '\nNo. of cols before copying: ', len(mcsm_df.columns)) - -mcsm_df['position'] = mcsm_df['Position'] -print('No. of cols after copying: ', len(mcsm_df.columns)) - -# sanity check -if mcsm_df['position'].equals(mcsm_df['Position']): - print('PASS: Copying worked correctly' - , '\ncopied col matches original column' - , '\n===============================================================') -else: - print('FAIL: copied col does not match original column' - , '\n================================================================') - -# variables for sanity checks -expected_rows_df2 = len(mcsm_df) -# beware of harcoding! used for sanity check -ndfs = 2 -ncol_merge = 1 -offset = ndfs - ncol_merge -expected_cols_df2 = len(mcsm_df.columns) + len(combined_df1.columns) - offset - -print('Merge 2:' - , '\ncombining 2dfs by commom col: position' - , '\nExpected nrows in combined_df:', expected_rows_df2 - , '\nExpected ncols in combined_df:', expected_cols_df2 - , '\n===================================================================') - -combined_df2 = mcsm_df.merge(combined_df1, on = 'position') - -# sanity check -print('Checking dimensions of concatenated df2...') -if len(combined_df2) == expected_rows_df2 and len(combined_df2.columns) == expected_cols_df2: - print('PASS: combined df2 has expected dimensions' - , '\nNo. of rows in combined df:', len(combined_df2) - , '\nNo. of cols in combined df:', len(combined_df2.columns) - , '\n===============================================================') -else: - print('FAIL: combined df2 does not have expected dimensions' - , '\nNo. of rows in combined df:', len(combined_df2) - , '\nNo. of cols in combined df:', len(combined_df2.columns) - , '\n===============================================================') - -#%% write file -print('Writing file:' - , '\nFilename:', out_filename - , '\nPath:', outdir - , '\n===================================================================') - -combined_df2.to_csv(outfile, header = True, index = False) - -print('Finished writing:', out_filename - , '\nNo. of rows:', len(combined_df2) - , '\nNo. of cols:', len(combined_df2.columns) - , '\n===================================================================') - -#%% end of script -#============================================================================== diff --git a/meta_data_analysis/combining_df_lig.R b/meta_data_analysis/combining_df_lig.R deleted file mode 100644 index d9b6b9a..0000000 --- a/meta_data_analysis/combining_df_lig.R +++ /dev/null @@ -1,397 +0,0 @@ -#======================================================================= -# TASK: To combine mcsm and meta data with af and or -# by filtering for distance to ligand (<10Ang). -# This script doesn't output anything. -# This script is sourced from other .R scripts for plotting ligand plots - -# Input csv files: -# 1) mcsm normalised and struct params -# 2) gene associated meta_data_with_AFandOR -#======================================================================= -#%% specify curr dir -getwd() -setwd('~/git/LSHTM_analysis/meta_data_analysis/') -getwd() -#======================================================================= -#%% load packages -#require(data.table) -#require(arsenal) -#require(compare) -#library(tidyverse) - -# header file -header_dir = '~/git/LSHTM_analysis/' -source(paste0(header_dir, '/', 'my_header.R')) -#%% variable assignment: input and output paths & filenames -#======================================================================= -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = paste0(gene,'_p.') -cat(gene_match) - -#=========== -# data dir -#=========== -datadir = '~/git/Data' - -#=========== -# input -#=========== -# infile1: mCSM data -#indir = '~/git/Data/pyrazinamide/input/processed/' -indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline} -#in_filename = 'mcsm_complex1_normalised.csv' -in_filename = 'pnca_mcsm_struct_params.csv' -infile = paste0(indir, '/', in_filename) -cat(paste0('Reading infile1: mCSM output file', ' ', infile) ) - -# infile2: gene associated meta data combined with AF and OR -#indir: same as above -in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv') -infile_comb = paste0(indir, '/', in_filename_comb) -cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb)) - -#=========== -# output -#=========== -# Uncomment if and when required to output -outdir = paste0(datadir, '/', drug, '/', 'output') #same as indir -cat('Output dir: ', outdir, '\n') -#out_filename = paste0(tolower(gene), 'XXX') -#outfile = paste0(outdir, '/', out_filename) -#cat(paste0('Output file with full path:', outfile)) - -#%% end of variable assignment for input and output files -#======================================================================= -#%% Read input files - -##################################### -# input file 1: mcsm normalised data -# output of step 4 mcsm_pipeline -##################################### - -cat('Reading mcsm_data:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename) - -mcsm_data = read.csv(infile -# , row.names = 1 - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(mcsm_data) - , '\nNo. of cols:', ncol(mcsm_data)) - -# clear variables -rm(in_filename, infile) - -str(mcsm_data) - -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) - -# spelling Correction 1: DUET -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) -head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) - -# spelling Correction 2: Ligand -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) - -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) -head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) - -# muts with opposing effects on protomer and ligand stability -# excluded from here as it is redundant. -# check 'combining_two_df.R' to refer if required. -#======================================================================= -#%% !!!Filter data only for mcsm lig!!! - -########################### -# Filter/subset data -# Lig plots < 10Ang -# Filter the lig plots for Dis_to_lig < 10Ang -########################### - -# check range of distances -max(mcsm_data$Dis_lig_Ang) -min(mcsm_data$Dis_lig_Ang) - -# count -table(mcsm_data$Dis_lig_Ang<10) - -# subset data to have only values less than 10 Ang -mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10) - -# sanity checks -max(mcsm_data2$Dis_lig_Ang) -min(mcsm_data2$Dis_lig_Ang) - -# count no of unique positions -length(unique(mcsm_data2$Position)) - -# count no of unique mutations -length(unique(mcsm_data2$Mutationinformation)) - -# count Destabilisinga and stabilising -table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang} - -############################# -# Extra sanity check: -# for mcsm_lig ONLY -# Dis_lig_Ang should be <10 -############################# - -if (max(mcsm_data2$Dis_lig_Ang) < 10){ - print ("Sanity check passed: lig data is <10Ang") -}else{ - print ("Error: data should be filtered to be within 10Ang") -} - -#!!!!!!!!!!!!!!!!!!!!! -# REASSIGNMENT: so as not to alter the script -mcsm_data = mcsm_data2 -#!!!!!!!!!!!!!!!!!!!!! -#======================================================================= -# clear variables -rm(mcsm_data2) - -# count na in each column -na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count - -# sort by Mutationinformation -mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] -head(mcsm_data$Mutationinformation) - -orig_col = ncol(mcsm_data) -# get freq count of positions and add to the df -setDT(mcsm_data)[, mut_pos_occurrence := .N, by = .(position)] - -cat('Added col: position frequency to see which posn has how many muts' - , '\nNo. of cols now', ncol(mcsm_data) - , '\nNo. of cols before: ', orig_col) - -mut_pos_occurrence = data.frame(mcsm_data$id, mcsm_data$Position, mcsm_data$mut_pos_occurrence) - -###################################### -# input file2 meta data with AFandOR -###################################### -cat('Reading combined meta data and AFandOR file:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename_comb) - -meta_with_afor <- read.csv(infile_comb - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(meta_with_afor) - , '\nNo. of cols:', ncol(meta_with_afor)) - -# clear variables -rm(in_filename_comb, infile_comb) - -str(meta_with_afor) - -# sort by Mutationinformation -head(meta_with_afor$Mutationinformation) -meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] -head(meta_with_afor$Mutationinformation) - -orig_col2 = ncol(meta_with_afor) - -# get freq count of positions and add to the df -setDT(meta_with_afor)[, sample_pos_occurrence := .N, by = .(Position)] - -cat('Added col: position frequency of samples to check' - ,'how many samples correspond to a partiulcar posn associated with muts' - , '\nNo. of cols now', ncol(meta_with_afor) - , '\nNo. of cols before: ', orig_col2) - -sample_pos_occurrence = data.frame(meta_with_afor$id, meta_with_afor$position, meta_with_afor$sample_pos_occurrence) - -#======================================================================= -cat('Begin merging dfs with NAs' - , '\n===============================================================') - -########################### -# merging two dfs: with NA -########################### -# link col name = 'Mutationinforamtion' -cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2') - -head(mcsm_data$Mutationinformation) -head(meta_with_afor$Mutationinformation) - -######### -# a) merged_df2: meta data with mcsm -######### -merged_df2 = merge(x = meta_with_afor - , y = mcsm_data - , by = 'Mutationinformation' - , all.y = T) - -cat('Dim of merged_df2: ' - , '\nNo. of rows: ', nrow(merged_df2) - , '\nNo. of cols: ', ncol(merged_df2)) -head(merged_df2$Position) - -if(nrow(meta_with_afor) == nrow(merged_df2)){ - cat('nrow(merged_df2) = nrow (gene associated metadata)' - ,'\nExpected no. of rows: ',nrow(meta_with_afor) - ,'\nGot no. of rows: ', nrow(merged_df2)) -} else{ - cat('nrow(merged_df2)!= nrow(gene associated metadata)' - , '\nExpected no. of rows after merge: ', nrow(meta_with_afor) - , '\nGot no. of rows: ', nrow(merged_df2) - , '\nFinding discrepancy') - merged_muts_u = unique(merged_df2$Mutationinformation) - meta_muts_u = unique(meta_with_afor$Mutationinformation) - # find the index where it differs - unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) -} - -# sort by Position -head(merged_df2$Position) -merged_df2 = merged_df2[order(merged_df2$Position),] -head(merged_df2$Position) - -merged_df2v2 = merge(x = meta_with_afor - , y = mcsm_data - , by = 'Mutationinformation' - , all.x = T) -#!=!=!=!=!=!=!=! -# COMMENT: used all.y since position 186 is not part of the struc, -# hence doesn't have a mcsm value -# but 186 is associated with mutation -#!=!=!=!=!=!=!=! - -# should be False -identical(merged_df2, merged_df2v2) -table(merged_df2$Position%in%merged_df2v2$Position) - -rm(merged_df2v2) - -######### -# b) merged_df3: remove duplicate mutation information -######### -cat('Merging dfs with NAs: small df (removing duplicate muts)' - ,'\nCannot trust lineage info from this' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df3') - -#==#=#=#=#=#=# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#=#=#=#=#=#=#= -merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] -head(merged_df3$Position); tail(merged_df3$Position) # should be sorted - -# sanity checks -# nrows of merged_df3 should be the same as the nrows of mcsm_data -if(nrow(mcsm_data) == nrow(merged_df3)){ - cat('PASS: No. of rows match with mcsm_data' - ,'\nExpected no. of rows: ', nrow(mcsm_data) - ,'\nGot no. of rows: ', nrow(merged_df3)) -} else { - cat('FAIL: No. of rows mismatch' - , '\nNo. of rows mcsm_data: ', nrow(mcsm_data) - , '\nNo. of rows merged_df3: ', nrow(merged_df3)) -} - -# counting NAs in AF, OR cols in merged_df3 -# this is becuase mcsm has no AF, OR cols, -# so you cannot count NAs -if (identical(sum(is.na(merged_df3$OR)) - , sum(is.na(merged_df3$pvalue)) - , sum(is.na(merged_df3$AF)))){ - cat('PASS: NA count match for OR, pvalue and AF\n') - na_count_df3 = sum(is.na(merged_df3$AF)) - cat('No. of NAs: ', sum(is.na(merged_df3$OR))) -} else{ - cat('FAIL: NA count mismatch' - , '\nNA in OR: ', sum(is.na(merged_df3$OR)) - , '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue)) - , '\nNA in AF:', sum(is.na(merged_df3$AF))) -} -#======================================================================= -#%% merging without NAs - -cat('Begin merging dfs without NAs' - , '\n===============================================================') - -cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2_comp') - -######### -# c) merged_df2_comp: merging two dfs without NA -######### -merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] -#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),] - -# sanity check -cat('Checking nrows in merged_df2_comp') -if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){ - cat('PASS: No. of rows match' - ,'\nDim of merged_df2_comp: ' - ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 - , '\nNo. of rows: ', nrow(merged_df2_comp) - , '\nNo. of cols: ', ncol(merged_df2_comp)) -}else{ - cat('FAIL: No. of rows mismatch' - ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 - ,'\nGot no. of rows: ', nrow(merged_df2_comp)) -} - -######### -# d) merged_df3_comp: remove duplicate mutation information -######### -merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] - -cat('Dim of merged_df3_comp: ' - , '\nNo. of rows: ', nrow(merged_df3_comp) - , '\nNo. of cols: ', ncol(merged_df3_comp)) - -# alternate way of deriving merged_df3_comp -foo = merged_df3[!is.na(merged_df3$AF),] -# compare dfs: foo and merged_df3_com -all.equal(foo, merged_df3) - -summary(comparedf(foo, merged_df3)) - -# sanity check -cat('Checking nrows in merged_df3_comp') -if(nrow(merged_df3_comp) == nrow(merged_df3)){ - cat('NO NAs detected in merged_df3 in AF|OR cols' - ,'\nNo. of rows are identical: ', nrow(merged_df3)) -} else{ - if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) { - cat('PASS: NAs detected in merged_df3 in AF|OR cols' - , '\nNo. of NAs: ', na_count_df3 - , '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3 - , '\nGot no. of rows: ', nrow(merged_df3_comp)) - } -} - -#======================================================================= -# write_output files -# Not required as this is a subset of the combining_two_df.R -#************************* -# clear variables -rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir) -rm(mut_pos_occurrence) -#%% end of script -#======================================================================= - - diff --git a/meta_data_analysis/combining_df_ps.R b/meta_data_analysis/combining_df_ps.R deleted file mode 100644 index 770661d..0000000 --- a/meta_data_analysis/combining_df_ps.R +++ /dev/null @@ -1,461 +0,0 @@ -#======================================================================= -# TASK: To combine mcsm and meta data with af and or files -# Input csv files: -# 1) mcsm normalised and struct params -# 2) gene associated meta_data_with_AFandOR - -# Output: -# 1) muts with opposite effects on stability -# 2) large combined df including NAs for AF, OR,etc -# Dim: same no. of rows as gene associated meta_data_with_AFandOR -# 3) small combined df including NAs for AF, OR, etc. -# Dim: same as mcsm data -# 4) large combined df excluding NAs -# Dim: dim(#1) - no. of NAs(AF|OR) + 1 -# 5) small combined df excluding NAs -# Dim: dim(#2) - no. of unique NAs - 1 -# This script is sourced from other .R scripts for plotting -#======================================================================= -#%% specify curr dir -getwd() -setwd('~/git/LSHTM_analysis/meta_data_analysis/') -getwd() -#======================================================================= -#%% load packages -#require(data.table) -#require(arsenal) -#require(compare) -#library(tidyverse) - -# header file -header_dir = '~/git/LSHTM_analysis/' -source(paste0(header_dir, '/', 'my_header.R')) -#======================================================================= -#%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = paste0(gene,'_p.') -cat(gene_match) - -#=========== -# data dir -#=========== -datadir = '~/git/Data' - -#=========== -# input -#=========== -# infile1: mCSM data -#indir = '~/git/Data/pyrazinamide/input/processed/' -indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline} -#in_filename = 'mcsm_complex1_normalised.csv' -in_filename = 'pnca_mcsm_struct_params.csv' -infile = paste0(indir, '/', in_filename) -cat(paste0('Reading infile1: mCSM output file', ' ', infile, '\n') ) - -# infile2: gene associated meta data combined with AF and OR -#indir: same as above -in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv') -infile_comb = paste0(indir, '/', in_filename_comb) -cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb, '\n')) - -#=========== -# output -#=========== -# Uncomment if and when required to output -outdir = paste0(datadir, '/', drug, '/', 'output') #same as indir -cat('Output dir: ', outdir, '\n') -#out_filename = paste0(tolower(gene), 'XXX') -#outfile = paste0(outdir, '/', out_filename) -#cat(paste0('Output file with full path:', outfile)) - -#%% end of variable assignment for input and output files -#======================================================================= -#%% Read input files - -##################################### -# input file 1: mcsm normalised data -# output of step 4 mcsm_pipeline -##################################### -cat('Reading mcsm_data:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename) - -mcsm_data = read.csv(infile -# , row.names = 1 - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(mcsm_data) - , '\nNo. of cols:', ncol(mcsm_data)) - -# clear variables -rm(in_filename, infile) - -str(mcsm_data) - -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) - -# spelling Correction 1: DUET -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) ) -head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome) - -# spelling Correction 2: Ligand -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) - -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising' -mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising' - -# checks: should be the same as above -table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) ) -head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome) - -# muts with opposing effects on protomer and ligand stability -table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome) -changes = mcsm_data[which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome),] - -# sanity check: redundant, but uber cautious! -dl_i = which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome) -ld_i = which(mcsm_data$Lig_outcome != mcsm_data$DUET_outcome) - -cat('Identifying muts with opposite stability effects') -if(nrow(changes) == (table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)[[2]]) & identical(dl_i,ld_i)) { - cat('PASS: muts with opposite effects on stability and affinity correctly identified' - , '\nNo. of such muts: ', nrow(changes)) -}else { - cat('FAIL: unsuccessful in extracting muts with changed stability effects') -} - -#*************************** -# write file: changed muts -out_filename = 'muts_opp_effects.csv' -outfile = paste0(outdir, '/', out_filename) -cat('Writing file for muts with opp effects:' - , '\nFilename: ', outfile - , '\nPath: ', outdir) - -write.csv(changes, outfile) -#**************************** -# clear variables -rm(out_filename, outfile) -rm(changes, dl_i, ld_i) - -# count na in each column -na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count - -# sort by Mutationinformation -mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),] -head(mcsm_data$Mutationinformation) - -orig_col = ncol(mcsm_data) - -# get freq count of positions and add to the df -setDT(mcsm_data)[, mut_pos_occurrence := .N, by = .(Position)] - -cat('Added col: position frequency of muts to see which posn has how many muts' - , '\nNo. of cols now', ncol(mcsm_data) - , '\nNo. of cols before: ', orig_col) - -mut_pos_occurrence = data.frame(mcsm_data$Mutationinformation - , mcsm_data$Position - , mcsm_data$mut_pos_occurrence) - -colnames(mut_pos_occurrence) = c('Mutationinformation', 'position', 'mut_pos_occurrence') -####################################### -# input file 2: meta data with AFandOR -####################################### -cat('Reading combined meta data and AFandOR file:' - , '\nindir: ', indir - , '\ninfile_comb: ', in_filename_comb) - -meta_with_afor <- read.csv(infile_comb - , stringsAsFactors = F - , header = T) - -cat('Read mcsm_data file:' - , '\nNo.of rows: ', nrow(meta_with_afor) - , '\nNo. of cols:', ncol(meta_with_afor)) - -# counting NAs in AF, OR cols -if (identical(sum(is.na(meta_with_afor$OR)) - , sum(is.na(meta_with_afor$pvalue)) - , sum(is.na(meta_with_afor$AF)))){ - cat('PASS: NA count match for OR, pvalue and AF\n') - na_count = sum(is.na(meta_with_afor$AF)) - cat('No. of NAs: ', sum(is.na(meta_with_afor$OR))) -} else{ - cat('FAIL: NA count mismatch' - , '\nNA in OR: ', sum(is.na(meta_with_afor$OR)) - , '\nNA in pvalue: ', sum(is.na(meta_with_afor$pvalue)) - , '\nNA in AF:', sum(is.na(meta_with_afor$AF))) -} - -# clear variables -rm(in_filename_comb, infile_comb) - -str(meta_with_afor) - -# sort by Mutationinformation -head(meta_with_afor$Mutationinformation) -meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),] -head(meta_with_afor$Mutationinformation) - -orig_col2 = ncol(meta_with_afor) - -# get freq count of positions and add to the df -setDT(meta_with_afor)[, sample_pos_occurrence := .N, by = .(position)] - -cat('Added col: position frequency of samples to check' - ,'how many samples correspond to a partiulcar posn associated with muts' - , '\nNo. of cols now', ncol(meta_with_afor) - , '\nNo. of cols before: ', orig_col2) - -sample_pos_occurrence = data.frame(meta_with_afor$id - , meta_with_afor$mutation - , meta_with_afor$Mutationinformation - , meta_with_afor$position - , meta_with_afor$sample_pos_occurrence) -colnames(sample_pos_occurrence) = c('id', 'mutation', 'Mutationinformation', 'position', 'sample_pos_occurrence') -#======================================================================= -cat('Begin merging dfs with NAs' - , '\n===============================================================') - -########################### -# merging two dfs: with NA -########################### -# link col name = 'Mutationinforamtion' -head(mcsm_data$Mutationinformation) -head(meta_with_afor$Mutationinformation) - -cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2') - -######### -# a) merged_df2: meta data with mcsm -######### -merged_df2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.y = T) - -cat('Dim of merged_df2: ' - , '\nNo. of rows: ', nrow(merged_df2) - , '\nNo. of cols: ', ncol(merged_df2)) -head(merged_df2$Position) - -# sanity check -cat('Checking nrows in merged_df2') -if(nrow(meta_with_afor) == nrow(merged_df2)){ - cat('nrow(merged_df2) = nrow (gene associated metadata)' - ,'\nExpected no. of rows: ',nrow(meta_with_afor) - ,'\nGot no. of rows: ', nrow(merged_df2)) -} else{ - cat('nrow(merged_df2)!= nrow(gene associated metadata)' - , '\nExpected no. of rows after merge: ', nrow(meta_with_afor) - , '\nGot no. of rows: ', nrow(merged_df2) - , '\nFinding discrepancy') - merged_muts_u = unique(merged_df2$Mutationinformation) - meta_muts_u = unique(meta_with_afor$Mutationinformation) - # find the index where it differs - unique(meta_muts_u[! meta_muts_u %in% merged_muts_u]) -} - -# sort by Position -head(merged_df2$Position) -merged_df2 = merged_df2[order(merged_df2$Position),] -head(merged_df2$Position) - -merged_df2v2 = merge(x = meta_with_afor - ,y = mcsm_data - , by = 'Mutationinformation' - , all.x = T) -#!=!=!=!=!=!=!=! -# COMMENT: used all.y since position 186 is not part of the struc, -# hence doesn't have a mcsm value -# but 186 is associated with mutation -#!=!=!=!=!=!=!=! - -# should be False -identical(merged_df2, merged_df2v2) -table(merged_df2$Position%in%merged_df2v2$Position) - -rm(merged_df2v2) - -######### -# b) merged_df3:remove duplicate mutation information -######### -cat('Merging dfs without NAs: small df (removing muts with no AF|OR associated)' - ,'\nCannot trust lineage info from this' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df3') - -#==#=#=#=#=#=# -# Cannot trust lineage, country from this df as the same mutation -# can have many different lineages -# but this should be good for the numerical corr plots -#=#=#=#=#=#=#= -merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),] -head(merged_df3$Position); tail(merged_df3$Position) # should be sorted - -# sanity check -cat('Checking nrows in merged_df3') -if(nrow(mcsm_data) == nrow(merged_df3)){ - cat('PASS: No. of rows match with mcsm_data' - ,'\nExpected no. of rows: ', nrow(mcsm_data) - ,'\nGot no. of rows: ', nrow(merged_df3)) -} else { - cat('FAIL: No. of rows mismatch' - , '\nNo. of rows mcsm_data: ', nrow(mcsm_data) - , '\nNo. of rows merged_df3: ', nrow(merged_df3)) -} - -# counting NAs in AF, OR cols in merged_df3 -# this is becuase mcsm has no AF, OR cols, -# so you cannot count NAs -if (identical(sum(is.na(merged_df3$OR)) - , sum(is.na(merged_df3$pvalue)) - , sum(is.na(merged_df3$AF)))){ - cat('PASS: NA count match for OR, pvalue and AF\n') - na_count_df3 = sum(is.na(merged_df3$AF)) - cat('No. of NAs: ', sum(is.na(merged_df3$OR))) -} else{ - cat('FAIL: NA count mismatch' - , '\nNA in OR: ', sum(is.na(merged_df3$OR)) - , '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue)) - , '\nNA in AF:', sum(is.na(merged_df3$AF))) -} -#======================================================================= -#%% merging without NAs - -cat('Begin merging dfs without NAs' - , '\n===============================================================') - -cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)' - ,'\nlinking col: Mutationinforamtion' - ,'\nfilename: merged_df2_comp') - -######### -# c) merged_df2_comp: same as merge 1 but excluding NA -######### -merged_df2_comp = merged_df2[!is.na(merged_df2$AF),] -#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),] - -# sanity check -cat('Checking nrows in merged_df2_comp') -if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){ - cat('PASS: No. of rows match' - ,'\nDim of merged_df2_comp: ' - ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 - , '\nNo. of rows: ', nrow(merged_df2_comp) - , '\nNo. of cols: ', ncol(merged_df2_comp)) -}else{ - cat('FAIL: No. of rows mismatch' - ,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1 - ,'\nGot no. of rows: ', nrow(merged_df2_comp)) -} - -######### -# d) merged_df3_comp: remove duplicate mutation information -######### -merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),] - -cat('Dim of merged_df3_comp: ' - , '\nNo. of rows: ', nrow(merged_df3_comp) - , '\nNo. of cols: ', ncol(merged_df3_comp)) - -# alternate way of deriving merged_df3_comp -foo = merged_df3[!is.na(merged_df3$AF),] -# compare dfs: foo and merged_df3_com -all.equal(foo, merged_df3) - -summary(comparedf(foo, merged_df3)) - -# sanity check -cat('Checking nrows in merged_df3_comp') -if(nrow(merged_df3_comp) == nrow(merged_df3)){ - cat('NO NAs detected in merged_df3 in AF|OR cols' - ,'\nNo. of rows are identical: ', nrow(merged_df3)) -} else{ - if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) { - cat('PASS: NAs detected in merged_df3 in AF|OR cols' - , '\nNo. of NAs: ', na_count_df3 - , '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3 - , '\nGot no. of rows: ', nrow(merged_df3_comp)) - } -} -#======================================================================= -#********************* -# writing 1 file in the style of a loop: merged_df3 -# print(output dir) -#i = 'merged_df3' -#out_filename = paste0(i, '.csv') -#outfile = paste0(outdir, '/', out_filename) - -#cat('Writing output file: ' -# ,'\nFilename: ', out_filename -# ,'\nPath: ', outdir) - -#template: write.csv(merged_df3, 'merged_df3.csv') -#write.csv(get(i), outfile, row.names = FALSE) -#cat('Finished writing: ', outfile -# , '\nNo. of rows: ', nrow(get(i)) -# , '\nNo. of cols: ', ncol(get(i))) - -#%% write_output files; all 4 files: -outvars = c('merged_df2' - , 'merged_df3' - , 'merged_df2_comp' - , 'merged_df3_comp') - -cat('Writing output files: ' - , '\nPath:', outdir) - -for (i in outvars){ -# cat(i, '\n') - out_filename = paste0(i, '.csv') -# cat(out_filename, '\n') -# cat('getting value of variable: ', get(i)) - outfile = paste0(outdir, '/', out_filename) -# cat('Full output path: ', outfile, '\n') - cat('Writing output file:' - ,'\nFilename: ', out_filename,'\n') - write.csv(get(i), outfile, row.names = FALSE) - cat('Finished writing: ', outfile - , '\nNo. of rows: ', nrow(get(i)) - , '\nNo. of cols: ', ncol(get(i)), '\n') -} - -# alternate way to replace with implicit loop -# FIXME -#sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, '/', outvars, '.csv'))) - -#======================================================================= -#%% merging mut_pos_occurrence and sample_pos_occurence -# FIXME -#cat('Merging dfs with positional frequency from mcsm and meta_data' -# , '\nNcol in mut_pos_occurrence:', ncol(mut_pos_occurrence) -# , '\nncol in sample_pos_occurence:', ncol(sample_pos_occurrence) -# ,'\nlinking col:', intersect(colnames(sample_pos_occurrence), colnames(mut_pos_occurrence)) -# ,'\nfilename: merged_df4') - -#merged_df4 = merge(sample_pos_occurrence, mut_pos_occurrence -# , by = 'position' -# , all = T) - -#out_filename4 = 'mut_and_sample_freq.csv' -#outfile4 = paste0(outdir, '/', out_filename4) - -#************************* -# clear variables -rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir) -rm(mut_pos_occurrence, sample_pos_occurrence) -#rm(merged_df4) -#%% end of script -#======================================================================= - diff --git a/meta_data_analysis/dssp_df.py b/meta_data_analysis/dssp_df.py deleted file mode 100755 index a12f7d5..0000000 --- a/meta_data_analysis/dssp_df.py +++ /dev/null @@ -1,170 +0,0 @@ -#!/home/tanu/anaconda3/envs/ContactMap/bin/python3 -# -*- coding: utf-8 -*- -""" -Created on Tue Feb 18 10:10:12 2020 - -@author: tanu -""" -#======================================================================= -# Task: Read a DSSP file into a data frame and output to a csv file - -# Input: '.dssp' i.e gene associated.dssp file (output from run_dssp.sh) - -# Output: '.csv' file containing DSSP output as a df with ASA, RSA, etc. - # based on Tien at al 2013 (theor.) values - -# useful links: -# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html -# https://en.wikipedia.org/wiki/Relative_accessible_surface_area -#======================================================================= -#%% load packages -import sys, os -import re -import pandas as pd -from Bio.PDB import PDBParser -from Bio.PDB.DSSP import DSSP -import pprint as pp -import dms_tools2 -import dms_tools2.dssp - -#=======================================================================# -#%% specify homedir and curr dir -homedir = os.path.expanduser('~') - -# set working dir -os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') -os.getcwd() -#======================================================================= -#%% variable assignment: input and output -#drug = 'pyrazinamide' -#gene = 'pncA' -#gene_match = gene + '_p.' - -drug = 'cycloserine' -gene = 'alr' -#========== -# data dir -#========== -#indir = 'git/Data/pyrazinamide/input/original' -datadir = homedir + '/' + 'git/Data' - -#======= -# input from outdir -#======= -indir = datadir + '/' + drug + '/' + 'input' -#1) pdb file -in_filename_pdb = gene.lower() + '_complex' + '.pdb' -infile_pdb = indir + '/' + in_filename_pdb -print('Input pdb filename:', in_filename_pdb - , '\npath:', indir - , '\n============================================================') - -#2) dssp file -outdir = datadir + '/' + drug + '/' + 'output' -in_filename = gene.lower() +'.dssp' -infile = outdir + '/' + in_filename -print('Input dssp filename:', in_filename - , '\npath:', outdir - , '\n============================================================') - -#======= -# output -#======= -outdir = datadir + '/' + drug + '/' + 'output' -out_filename = gene.lower() + '_dssp.csv' -outfile = outdir + '/' + out_filename -print('Output filename:', out_filename - , '\npath:', outdir - , '\nOutfile: ', outfile - , '\n=============================================================') -#%% end of variable assignment for input and output files -#======================================================================= -#%% specify pdb chain as a list. Handy when more than 1 pdb chain -my_chains = ['A'] -# my_chains = ['A', 'B'] # for cycloserine - -# generate my_chains from dssp -p = PDBParser() -structure = p.get_structure(in_filename_pdb, infile_pdb) -model = structure[0] -dssp = DSSP(model, infile_pdb) - -#print(dssp.keys()) -#print(dssp.keys()[0][0]) -#print(len(dssp)) -#print(dssp.keys()[0][0]) -#print(dssp.keys()[len(dssp)-1][0]) - -dssp_chains = [] -for num_aa in range(0, len(dssp)): -# print(num_aa) -# extract the chain id only and append to a list - dssp_chains.append(dssp.keys()[num_aa][0]) - chainsL = list(set(dssp_chains)) - -print(chainsL) -# sort the list to make for sanity (since sets are not ordered) -# this will be required for dssp_df -my_chains = sorted(chainsL) - -print('dssp output for' - , in_filename, 'contains:', len(my_chains) - , 'chains:\n', my_chains) -#%% ==================================================================== -# Process dssp output and extract into df (single chain) - -#dssp_df = dms_tools2.dssp.processDSSP(infile, chain = my_chains) -#dssp_df['chain_dssp'] = my_chains -#pp.pprint(dssp_df) -#======================================================================= -# incase pdb has > 1 chain and you need to run it for all chains - -# initialise an empty df -dssp_df = pd.DataFrame() -print('Total no. of chains: ', len(my_chains)) -for chain_id in my_chains: - print('Chain id:', chain_id) - dssp_cur = pd.DataFrame() - dssp_cur = dms_tools2.dssp.processDSSP(infile, chain = chain_id) - #!!!Important!!! - dssp_cur['chain_id'] = chain_id - dssp_df = dssp_df.append(dssp_cur) - pp.pprint(dssp_df) - -#===================== -# Renaming amino-acid -# and site columns -#===================== -# Rename column (amino acid) as 'wild_type' and (site} as 'position' -# to be the same names as used in the file required for merging later. -dssp_df.columns -dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True) -dssp_df.columns - -# sanity check -if len(dssp_df) == len(dssp): - print('PASS: length of dssp_df has correct length') -else: - print('FAIL: length mismatch for dssp_df' - , '\nexpected length:', len(dssp) - , '\nGot length:', len(dssp_df) - , 'Debug please!') - -#%% Write ouput csv file -print('Writing file:', outfile - , '\nFilename:', out_filename - , '\nPath:', outdir - , '\n=============================================================') - -# write to csv -dssp_df.to_csv(outfile, header=True, index = False) - -print('Finished writing:', out_filename - , '\nNo. of rows:', len(dssp_df) - , '\nNo. of cols:', len(dssp_df.columns) - , '\n==============================================================') -#%% end of script -#======================================================================= -#%% run dssp to extract chain ids to later process the dssp output into a df - diff --git a/meta_data_analysis/init_data_dirs.py b/meta_data_analysis/init_data_dirs.py deleted file mode 100755 index 5037258..0000000 --- a/meta_data_analysis/init_data_dirs.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/python3 -# Initialise a blank 'Data' directory and drug subdirs etc. -# TODO: -# - Read base dir from config file -# - Create eg: '~/git/Data/{original,processed} -# - Create eg: '~/git/Data/processed/' + drug (for each drug) -# - Create eg: '~/git/Data/output/' + drug + '{plots, structure}' - - -#%% specify homedir as python doesn't recognise tilde -homedir = os.path.expanduser('~') - -#%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = gene + '_p.' - -#========== -# data dir -#========== -#indir = 'git/Data/pyrazinamide/input/original' -datadir = homedir + '/' + 'git/Data' - -#========== -# input dir -#========== -indir = datadir + '/' + drug + '/' + 'input' - -#============ -# output dir -#============ -# several output files -outdir = datadir + '/' + drug + '/' + 'output' - -#%%end of variable assignment for input and output files -#============================================================================== diff --git a/meta_data_analysis/kd_df.py b/meta_data_analysis/kd_df.py deleted file mode 100644 index c08f157..0000000 --- a/meta_data_analysis/kd_df.py +++ /dev/null @@ -1,193 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -''' -Created on Tue Aug 6 12:56:03 2019 - -@author: tanu -''' -#======================================================================= -# Task: Hydrophobicity (Kd) values for amino acid sequence using the -# Kyt&-Doolittle. -# Same output as using the expasy server https://web.expasy.org/protscale/ -# Input: fasta file - -# Output: csv file with - - -# useful links -# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html -# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html -#======================================================================= -#%% load packages -from pylab import * -from Bio.SeqUtils import ProtParamData -from Bio.SeqUtils.ProtParam import ProteinAnalysis -from Bio import SeqIO -#from Bio.Alphabet.IUPAC import IUPACProtein -#import pprint as pp -import pandas as pd -import numpy as np -import sys, os -#======================================================================= -#%% specify input and curr dir -homedir = os.path.expanduser('~') - -# set working dir -os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') -os.getcwd() -#======================================================================= -#%% variable assignment: input and output -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = gene + '_p.' - -#========== -# data dir -#========== -#indir = 'git/Data/pyrazinamide/input/original' -datadir = homedir + '/' + 'git/Data' - -#======= -# input -#======= -#indir = 'git/Data/pyrazinamide/input/original' -indir = datadir + '/' + drug + '/' + 'input' -in_filename = '3pl1.fasta.txt' -infile = indir + '/' + in_filename -print('Input filename:', in_filename - , '\nInput path:', indir - , '\n============================================================') - -#======= -# output -#======= -outdir = datadir + '/' + drug + '/' + 'output' -out_filename = gene.lower() + '_kd.csv' -outfile = outdir + '/' + out_filename -print('Output filename:', out_filename - , '\nOutput path:', outdir - , '\n=============================================================') -#%% end of variable assignment for input and output files -#======================================================================= -#=================== -#calculate KD values: same as the expasy server -#=================== -#%%specify window size for hydropathy profile computation -# https://web.expasy.org/protscale/pscale/protscale_help.html -my_window = 3 -offset = round((my_window/2)-0.5) - -fh = open(infile) - -for record in SeqIO.parse(fh, 'fasta'): - id = record.id - seq = record.seq - num_residues = len(seq) -fh.close() - -sequence = str(seq) - -X = ProteinAnalysis(sequence) - -kd_values = (X.protein_scale(ProtParamData.kd , window = my_window)) # edge weight is set to default (100%) - -# sanity checks -print('Sequence Length:', num_residues) -print('kd_values Length:',len(kd_values)) -print('Window Length:', my_window) -print('Window Offset:', offset) -print('=================================================================') -print('Checking:len(kd values) is as expected for the given window size & offset...') -expected_length = num_residues - (my_window - offset) -if len(kd_values) == expected_length: - print('PASS: expected and actual length of kd values match') -else: - print('FAIL: length mismatch' - ,'\nExpected length:', expected_length - ,'\nActual length:', len(kd_values) - , '\n=========================================================') -#=================== -# creating two dfs -#=================== -#%% make 2 dfs; 1) aa sequence and 2) kd_values. Then reset index for each df -# which will allow easy merging of the two dfs. - -# df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence) -# Name column of wt as 'wild_type' to be the same name used in the file required for merging later. -dfSeq = pd.DataFrame({'wild_type_kd':list(sequence)}) -dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive - -# df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values -dfVals = pd.DataFrame({'kd_values':kd_values}) -dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset) - -# sanity checks -max(dfVals['kd_values']) -min(dfVals['kd_values']) - -#=================== -# concatenating dfs -#=================== -# Merge the two on index -# (as these are now reflective of the aa position numbers): df1 and df2 -# This will introduce NaN where there is missing values. In our case this -# will be 2 (first and last ones based on window size and offset) -# In our case this will be 2 (first and last ones) -# For pnca: the last position is not part of the struc, so not info loss -# Needless to say that this will be variable for other targets. - -kd_df = pd.concat([dfSeq, dfVals], axis = 1) - -#============================ -# renaming index to position -#============================ -kd_df = kd_df.rename_axis('position') -kd_df.head -print('=================================================================') - -print('position col i.e. index should be numeric - , '\n===============================================================') - -if kd_df.index.dtype == 'int64': - print('PASS: position col is numeric' - , '\ndtype is:', kd_df.index.dtype) -else: - print('FAIL: position col is not numeric' - , '\nConverting to numeric') - kd_df.index.astype('int64') - print('Checking dtype for after conversion:\n' - , '\ndtype is:', kd_df.index.dtype - , '\n=========================================================') -#=============== -# writing file -#=============== -print('Writing file:', out_filename - , '\nFilename:', out_filename - , '\nPath:', outdir - , '\n=============================================================') - -kd_df.to_csv(outfile, header = True, index = True) - -print('Finished writing:', out_filename - , '\nNo. of rows:', len(kd_df) - , '\nNo. of cols:', len(kd_df.columns) - , '\n=============================================================') - -#=============== -# plot: optional! -#===============#%% plot -# http://www.dalkescientific.com/writings/NBN/plotting.html - -# FIXME: save fig -# extract just pdb if from 'id' to pass to title of plot -# foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1) -plot(kd_values, linewidth = 1.0) -#axis(xmin = 1, xmax = num_residues) -xlabel('Residue Number') -ylabel('Hydrophobicity') -title('K&D Hydrophobicity for ' + id) -show() -print('======================================================================') -#%% end of script -#======================================================================= diff --git a/meta_data_analysis/mut_electrostatic_changes.py b/meta_data_analysis/mut_electrostatic_changes.py deleted file mode 100755 index 20ea86a..0000000 --- a/meta_data_analysis/mut_electrostatic_changes.py +++ /dev/null @@ -1,167 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -''' -Created on Tue Aug 6 12:56:03 2019 - -@author: tanu -''' - -# FIXME: import dirs.py to get the basic dir paths available -#======================================================================= -# TASK: calculate how many mutations result in -# electrostatic changes wrt wt - -# Input: mcsm and AF_OR file - -# Output: mut_elec_changes_results.txt -#======================================================================= -#%% load libraries -import os, sys -import pandas as pd -#import numpy as np -#======================================================================= -#%% specify homedir and curr dir -homedir = os.path.expanduser('~') - -# set working dir -os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') -os.getcwd() -#======================================================================= -#%% variable assignment: input and output -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = gene + '_p.' - -#========== -# data dir -#========== -#indir = 'git/Data/pyrazinamide/input/original' -datadir = homedir + '/' + 'git/Data' - -#======= -# input -#======= -indir = datadir + '/' + drug + '/' + 'input' -in_filename = 'merged_df3.csv' -infile = outdir + '/' + in_filename -print('Input filename: ', in_filename - , '\nInput path: ', indir - , '\n============================================================') - -#======= -# output -#======= -outdir = datadir + '/' + drug + '/' + 'output' -# specify output file -out_filename = 'mut_elec_changes.txt' -outfile = outdir + '/' + out_filename -print('Output filename: ', out_filename - , '\nOutput path: ', outdir - , '\n============================================================') - -#%% end of variable assignment for input and output files -#======================================================================= -#%% Read input files -print('Reading input file (merged file):', infile) - -comb_df = pd.read_csv(infile, sep = ',') - -print('Input filename: ', in_filename - , '\nPath :', outdir - , '\nNo. of rows: ', len(comb_df) - , '\nNo. of cols: ', infile - , '\n============================================================') - -# column names -list(comb_df.columns) - -# clear variables -del(in_filename, infile) - -#%% subset unique mutations -df = comb_df.drop_duplicates(['Mutationinformation'], keep = 'first') - -total_muts = df.Mutationinformation.nunique() -#df.Mutationinformation.count() -print('Total mutations associated with structure: ', total_muts - , '\n===============================================================') - -#%% combine aa_calcprop cols so that you can count the changes as value_counts -# check if all muts have been categorised -print('Checking if all muts have been categorised: ') -if df['wt_calcprop'].isna().sum() == 0 & df['mut_calcprop'].isna().sum(): - print('PASS: No. NA detected i.e all muts have aa prop associated' - , '\n===============================================================') -else: - print('FAIL: NAs detected i.e some muts remain unclassified' - , '\n===============================================================') - -df['wt_calcprop'].head() -df['mut_calcprop'].head() - -print('Combining wt_calcprop and mut_calcprop...') -#df['aa_calcprop_combined'] = df['wt_calcprop']+ '->' + df['mut_calcprop'] -df['aa_calcprop_combined'] = df.wt_calcprop.str.cat(df.mut_calcprop, sep = '->') -df['aa_calcprop_combined'].head() - -mut_categ = df["aa_calcprop_combined"].unique() -print('Total no. of aa_calc properties: ', len(mut_categ)) -print('Categories are: ', mut_categ) - -# counting no. of muts in each mut categ - -# way1: count values within each combinaton -df.groupby('aa_calcprop_combined').size() -#df.groupby('aa_calcprop_combined').count() - -# way2: count values within each combinaton -df['aa_calcprop_combined'].value_counts() - -# comment: the two ways should be identical -# groupby result order is similar to pivot table order, -# I prefer the value_counts look - -# assign to variable: count values within each combinaton -all_prop = df['aa_calcprop_combined'].value_counts() - -# convert to a df from Series -ap_df = pd.DataFrame({'aa_calcprop': all_prop.index, 'mut_count': all_prop.values}) - -# subset df to contain only the changes in prop -all_prop_change = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == False] - -elec_count = all_prop_change.mut_count.sum() -print('Total no.of muts with elec changes: ', elec_count) - -# calculate percentage of electrostatic changes -elec_changes = (elec_count/total_muts) * 100 - -print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes) - -# check no change muts -no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == True] - -no_change_muts.mut_count.sum() - -#%% output from console -#sys.stdout = open(file, 'w') -sys.stdout = open(outfile, 'w') - -print('======================\n' - ,'Unchanged muts' - ,'\n=====================\n' - , no_change_muts - ,'\n=============================\n' - , 'Muts with changed prop:' - , '\n============================\n' - , all_prop_change) - -print('=================================================================' -, '\nTotal number of electrostatic changes resulting from Mtation is (%):', elec_changes -, '\nTotal no. of muts: ', total_muts -, '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum() -, '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum() -, '\n===================================================================') -#%% end of script -#======================================================================= diff --git a/meta_data_analysis/mutate.py b/meta_data_analysis/mutate.py deleted file mode 100644 index fcabc70..0000000 --- a/meta_data_analysis/mutate.py +++ /dev/null @@ -1,179 +0,0 @@ -#!/usr/bin/python -from __future__ import print_function -from Bio import SeqIO -from Bio.Seq import Seq -from collections import OrderedDict -import sys -import argparse -# https://github.com/jrjhealey/bioinfo-tools/blob/master/Mutate.py -# https://www.biostars.org/p/336891/ - -# TODO: -# - create some logic to 'group' mutations that will be applied to the same sequence, to -# make all switches at once -# - This will also probably break the verbose transversion output so the maths will need replacing -# - Create the ability to support INDELS (will also require pairwise alignment so that -# hamming distances remain meaningful. - - -def get_args(): - """Parse command line arguments""" - desc = "Mutate fasta sequences based on a file of sequence mappings." - epi = ( - "This script takes a mapfile of the form:\n" - " SequenceID,A123B\n" - " SequenceID,X456Y\n" - "And performs substitutions/mutations. At preset it only does one SNP per sequence.\n" - ) - - try: - parser = argparse.ArgumentParser( - description=desc, epilog=epi, formatter_class=argparse.RawTextHelpFormatter - ) - parser.add_argument( - "mutation_file", - action="store", - help='File of mutation mappings like so: "SeqID,X123Y"', - ) - parser.add_argument( - "sequences", - action="store", - help="File of sequences to be mutated (fasta only).", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - help="Verbose behaviour, printing parameters of the script.", - ) - parser.add_argument( - "-o", - "--outfile", - action="store", - help="Output file for mutated sequences (default STDOUT).", - ) - if len(sys.argv) == 1: - parser.print_help(sys.stderr) - exit(1) - except: - sys.stderr.write( - "An exception occurred with argument parsing. Check your provided options.\n" - ) - - return parser.parse_args() - - -class Mutation(object): - """A class wrapper for sequence IDs so that duplicate IDs can be used in a dictionary""" - - def __init__(self, name): - self.name = name - - def __repr__(self): - return "'" + self.name + "'" - - def __str__(self): - return self.name - - -def parse_mapfile(mapfile): - """Return a dict of mapped mutations. - File should resemble: - SequenceID,A123B - SequenceID2,X234Y - Sequence IDs should exactly match the fasta headers, as parsed by BioPython. - (">" symbols are optional) - """ - - with open(mapfile, "r") as handle: - mut_dict = OrderedDict() - for line in handle: - id, change = line.lstrip(">").rstrip("\n").split(",") - mut_dict[Mutation(id)] = change - - for k, v in mut_dict.items(): - assert v[0].isalpha(), ( - "First character of mutation map is not a valid letter. Got: %s" % v[0] - ) - assert v[-1].isalpha(), ( - "Last character of mutation map is not a valid letter. Got: %s" % v[-1] - ) - assert v[1:-1].isdigit(), ( - "Location string of mutation map is not a valid number. Got: %s" % v[1:-1] - ) - - return mut_dict - - -def morph(orig, loc, new, mutableseq, verbose): - """Perform actual sequence change (polymorphism only at present)""" - # Shift location to offset 0-based index - loc = loc - 1 - assert mutableseq[loc] == orig, ( - "Sequence does not match the mutation file for pre-exising residue. Expected %s , got %s " - % (orig, mutableseq[loc]) - ) - if verbose is True: - print( - "Performing change: {} -> {}, at location: {} (0 based)".format( - orig, new, loc - ) - ) - mutableseq[loc] = new - return mutableseq - - -def hamming_distance(s1, s2): - """Return the Hamming distance between equal-length sequences""" - if len(s1) != len(s2): - raise ValueError("Undefined for sequences of unequal length") - return sum(ch1 != ch2 for ch1, ch2 in zip(s1.upper(), s2.upper())) - - -def main(): - args = get_args() - if args.outfile is not None: - ofh = open(args.outfile, "w") - - # Parse the mutation file (get mutations by sequence) - mutations = parse_mapfile(args.mutation_file) - if args.verbose is True: - print("Got mutations:") - print(mutations) - # Iterate all sequences and make any substitutions necessary - for record in SeqIO.parse(args.sequences, "fasta"): - for k, v in mutations.items(): - mutable = record.seq.upper().tomutable() -# mutable = record.seq.tomutable() -# print("MO:", mutable) - if k.name == record.id[0:4]: # BEWARE HARDCODING -# print("k.name:", k.name, "record.id:", record.id) - orig = v[0] - print("orig:", orig) - new = v[-1] - loc = int(v[1:-1]) - if args.verbose: - print(record.id) - newseq = morph(orig, loc, new, mutable, args.verbose) -# print("NS is:", newseq) - if args.verbose is True: - print("Original: " + record.seq.upper()) - print( - str((" " * int(loc - 2 + 11))) + "V" - ) # Padded string to show where switch happened (not sure how it'll deal with line wrapping - print("New: " + newseq) - print( - "Distance: " - + str(hamming_distance(str(record.seq), str(newseq))) - ) - if args.outfile is not None: - ofh.write(">%s_%s\n%s\n" % (record.id, v, newseq)) - if args.verbose is False: - print(">%s_%s\n%s\n" % (record.id, v, newseq)) - - if args.outfile is not None: - ofh.close() - - -if __name__ == "__main__": - main() diff --git a/meta_data_analysis/rd_df.py b/meta_data_analysis/rd_df.py deleted file mode 100755 index 0074959..0000000 --- a/meta_data_analysis/rd_df.py +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -''' -Created on Tue Aug 6 12:56:03 2019 - -@author: tanu -''' -#============================================================================= -# Task: Residue depth (rd) processing to generate a df with residue_depth(rd) -# values - -# FIXME -# Input: '.tsv' i.e residue depth txt file (output from .zip file manually -# downloaded from the website). -# This should be integrated into the pipeline - -# Output: .csv with 3 cols i.e position, rd_values & 3-letter wt aa code(caps) -#============================================================================= -#%% load packages -import sys, os -import pandas as pd -#import numpy as np -#============================================================================= -#%% specify input and curr dir -homedir = os.path.expanduser('~') - -# set working dir -os.getcwd() -os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis') -os.getcwd() -#============================================================================= -#%% variable assignment: input and output -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = gene + '_p.' - -#========== -# data dir -#========== -#indir = 'git/Data/pyrazinamide/input/original' -datadir = homedir + '/' + 'git/Data' - -#======= -# input -#======= -#indir = 'git/Data/pyrazinamide/input/original' -indir = datadir + '/' + drug + '/' + 'output' -in_filename = '3pl1_rd.tsv' -infile = indir + '/' + in_filename -print('Input filename:', in_filename - , '\nInput path:', indir - , '\n=============================================================') -#======= -# output -#======= -outdir = datadir + '/' + drug + '/' + 'output' -out_filename = gene.lower() + '_rd.csv' -outfile = outdir + '/' + out_filename -print('Output filename:', out_filename - , '\nOutput path:', outdir - , '\n=============================================================') - -#%% end of variable assignment for input and output files -#======================================================================= -#%% Read input file -rd_data = pd.read_csv(infile, sep = '\t') -print('Reading input file:', infile - , '\nNo. of rows:', len(rd_data) - , '\nNo. of cols:', len(rd_data.columns)) - -print('Column names:', rd_data.columns - , '\n===============================================================') -#======================== -# creating position col -#======================== -# Extracting residue number from index and assigning -# the values to a column [position]. Then convert the position col to numeric. -rd_data['position'] = rd_data.index.str.extract('([0-9]+)').values - -# converting position to numeric -rd_data['position'] = pd.to_numeric(rd_data['position']) -rd_data['position'].dtype - -print('Extracted residue num from index and assigned as a column:' - , '\ncolumn name: position' - , '\ntotal no. of cols now:', len(rd_data.columns) - , '\n=============================================================') - -#======================== -# Renaming amino-acid -# and all-atom cols -#======================== -print('Renaming columns:' - , '\ncolname==> # chain:residue: wt_3letter_caps' - , '\nYES... the column name *actually* contains a # ..!' - , '\ncolname==> all-atom: rd_values' - , '\n=============================================================') - -rd_data.rename(columns = {'# chain:residue':'wt_3letter_caps', 'all-atom':'rd_values'}, inplace = True) -print('Column names:', rd_data.columns) - -#======================== -# extracting df with the -# desired columns -#======================== -print('Extracting relevant columns for writing df as csv') - -rd_df = rd_data[['position','rd_values','wt_3letter_caps']] - -if len(rd_df) == len(rd_data): - print('PASS: extracted df has expected no. of rows' - ,'\nExtracted df dim:' - ,'\nNo. of rows:', len(rd_df) - ,'\nNo. of cols:', len(rd_df.columns)) -else: - print('FAIL: no. of rows mimatch' - , '\nExpected no. of rows:', len(rd_data) - , '\nGot no. of rows:', len(rd_df) - , '\n=========================================================') - -#%% write file -print('Writing file:' - , '\nFilename:', out_filename - , '\nPath:', outdir - , '\n=============================================================') - -rd_df.to_csv(outfile, header = True, index = False) - -print('Finished writing:', out_filename - , '\nNo. of rows:', len(rd_df) - , '\nNo. of cols:', len(rd_df.columns) - , '\n=============================================================') - -#%% end of script -#======================================================================= diff --git a/meta_data_analysis/run_mutate.sh b/meta_data_analysis/run_mutate.sh deleted file mode 100755 index 1865a5e..0000000 --- a/meta_data_analysis/run_mutate.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -#https://www.biostars.org/p/336891/ -#python Mutate.py -v -o /path/to/output.fasta mutation_map_file.csv input.fasta - -# pnca_all_muts_msa_FIXME: This should be formatted like this from python script -# change to a cmd script that takes this "prefix" as an input -for i in $(cat pnca_all_muts_msa_FIXME.csv); do echo "3PL1,${i}"; done > pnca_copy.txt - -# make sure there is no new line at the end of the mutation file (snps.csv) -#python3 Mutate.py -v -o /home/tanu/git/Data/pyrazinamide/input/output.fasta mut_map.csv 3pl1.fasta.txt -python3 mutate.py -v -o /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt /home/tanu/git/Data/pyrazinamide/output/pnca_all_muts_msa.csv /home/tanu/git/Data/pyrazinamide/input/pnca_fasta.txt - -# remove fasta style header lines in the output i.e -# lines beginning with '>' so the file is just the mutated seqs -sed -i '/^>.*$/d' /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt -printf 'No. of lines after cleaning: ' -cat /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt | wc -l diff --git a/meta_data_analysis/run_pdb_dssp.sh b/meta_data_analysis/run_pdb_dssp.sh deleted file mode 100755 index 1ad3b36..0000000 --- a/meta_data_analysis/run_pdb_dssp.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -#======================================================================= -# Task: read a pdb file and generate a dssp output file -# Input: -# pdb file - -# Output: -# pdb_code.dssp -# needs dssp exe on linux -# more efficient to run dssp exe locally - -# note: double quotes for variable interpolation -#======================================================================= -# #%%specify variables for input and output paths and filenames -drug='pyrazinamide' -gene='pncA' -#convert to lowercase for consistency in filenames -gene_l=$(printf $gene | awk '{print tolower($0)}') -gene_match="${gene}_p." -#printf "${gene_match}\n" - -#========== -# data dir -#========== -#indir = 'git/Data/pyrazinamide/input/original' -#datadir="~git/Data" -datadir="${HOME}/git/Data" - -#======= -# input -#======= -indir=${datadir}'/'${drug}'/input' -printf "Input dir: ${indir}\n" -in_filename='3pl1.pdb' -#infile=${basedir}${inpath}${in_filename} -infile=${indir}'/'${in_filename} -printf "Input file: ${infile}\n" - -#======= -# output -#======= -outdir=${datadir}'/'${drug}$'/output' -printf "Output dir: ${outdir}" -#out_filename='3pl1.dssp' -out_filename="${gene_l}.dssp" -outfile=${outdir}'/'${out_filename} -printf "Output file: ${outfile}\n" - -#%%end of variable assignment for input and output files -#================================================================ -# command line arg to run dssp and create output file -dssp -i ${infile} -o ${outfile} -printf "Finished writing: ${outfile}\n" -#printf "Filename: ${out_filename}\nlocated in: ${outdir}\n" diff --git a/plotting_test/logo_plot.R b/plotting_test/logo_plot.R deleted file mode 100644 index be1d9d7..0000000 --- a/plotting_test/logo_plot.R +++ /dev/null @@ -1,221 +0,0 @@ -#======================================================================= -# Task: To generate a logo plot or bar plot but coloured -# aa properties. -# step1: read mcsm file and OR file -# step2: plot wild type positions -# step3: plot mutants per position coloured by aa properties -# step4: make the size of the letters/bars prop to OR if you can! - -# useful links -# https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2 -# https://omarwagih.github.io/ggseqlogo/ -# https://kkdey.github.io/Logolas-pages/workflow.html -# A new sequence logo plot to highlight enrichment and depletion. -# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/ - -#very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/ -#======================================================================= -#%% specify curr dir -getwd() -setwd('~/git/LSHTM_analysis/plotting_test/') -getwd() -#======================================================================= -#%% load packages - -# header file -header_dir = '~/git/LSHTM_analysis/' -source(paste0(header_dir, '/', 'my_header.R')) -#======================================================================= -#%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = paste0(gene,'_p.') -cat(gene_match) - -#=========== -# data dir -#=========== -datadir = paste0('~/git/Data') - -#=========== -# input -#=========== -# source R script 'combining_two_df.R' -#indir = paste0(datadir, '/', drug, '/', 'output') # reading files -indir = '../meta_data_analysis' # sourcing R script -in_filename = 'combining_df_ps.R' -infile = paste0(indir, '/', in_filename) -cat(paste0('Input is a R script: ', '\'', infile, '\'') - , '\n========================================================') - -#=========== -# output -#=========== -# 1) lineage dist of all muts -outdir = paste0('~/git/Data', '/', drug, '/', 'output/plots') #same as indir -#cat('Output dir: ', outdir, '\n') -#file_type = '.svg' -#out_filename1 = paste0(tolower(gene), '_lineage_dist_ps', file_type) -#outfile1 = paste0(outdir, '/', out_filename1) -#cat(paste0('Output plot1 :', outfile1) -# , '\n========================================================') - -#%% end of variable assignment for input and output files -#======================================================================= -##%% read input file -cat('Reading input file(sourcing R script):', in_filename) - -source(infile) - -#========================== -# This will return: - -# df with NA for pyrazinamide: -# merged_df2 -# merged_df3 - -# df without NA for pyrazinamide: -# merged_df2_comp -# merged_df3_comp -#=========================== - -########################### -# Data for plots -# you need merged_df2 or merged_df2_comp -# since this is one-many relationship -# i.e the same SNP can belong to multiple lineages -# using the _comp dataset means -# we lose some muts and at this level, we should use -# as much info as available, hence use df with NA - -# This will the first plotting df -# Then subset this to extract dr muts only (second plottig df) -########################### - -#%%%%%%%%%%%%%%%%%%%%%%%%% -# uncomment as necessary -# REASSIGNMENT -#my_data = merged_df2 -#my_data = merged_df2_comp -#my_data = merged_df3 -my_data = merged_df3_comp -#%%%%%%%%%%%%%%%%%%%%%%%%%% - -# delete variables not required -rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) - -# quick checks -colnames(my_data) -str(my_data) - -c1 = unique(my_data$Position) -nrow(my_data) -cat('No. of rows in my_data:', nrow(my_data) - , '\nDistinct positions corresponding to snps:', length(c1) - , '\n===========================================================') - -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# FIXME: Think and decide what you want to remove -# mut_pos_occurence < 1 or sample_pos_occurrence <1 - -# get freq count of positions so you can subset freq<1 -require(data.table) -#setDT(my_data)[, mut_pos_occurrence := .N, by = .(Position)] #265, 14 - -#extract freq_pos>1 -#my_data_snp = my_data[my_data$occurrence!=1,] - -#u = unique(my_data_snp$Position) #73 -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -# REASSIGNMENT to prevent changing code -my_data_snp = my_data -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -#======================================================================= -#%% logo plots from dataframe - -############# -# PLOTS: ggseqlogo with custom height -# https://omarwagih.github.io/ggseqlogo/ -############# -#require(ggplot2) -#require(tidyverse) -library(ggseqlogo) - -foo = my_data_snp[, c("Position", "Mutant_type","ratioDUET", "OR" - , "mut_prop_polarity", "mut_prop_water") ] - -# log10OR -# FIXME: at the source script (when calculating AFandOR) -my_data_snp$log10or = log10(my_data_snp$OR) -bar = my_data_snp[, c('Position', 'Mutant_type', 'OR', 'logor', 'log10or')] - - -bar_or = my_data_snp[, c('Position', 'Mutant_type', 'OR')] -wide_df_or <- bar_or %>% spread(Position, OR, fill = 0) -wide_df_or = as.matrix(wide_df_or) -rownames(wide_df_or) = wide_df_or[,1] -wide_df_or = wide_df_or[,-1] - -# custom height (OR) logo plot: yayy works -ggseqlogo(wide_df_or, method='custom', seq_type='aa') + ylab('my custom height') + - theme(legend.position = "bottom" - , axis.text.x = element_text(size = 11 - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = 15 - , angle = 0 - , hjust = 1 - , vjust = 0))+ - labs(title = "AA logo plot" - , x = "Wild-type Position" - , y = "OR") -#%% end of logo plot with OR as height -#======================================================================= -# extracting data with log10OR -bar_logor = my_data_snp[, c('Position', 'Mutant_type', 'log10or')] -wide_df_logor <- bar_logor %>% spread(Position, log10or, fill = 0) - -wide_df_logor = as.matrix(wide_df_logor) -rownames(wide_df_logor) = wide_df_logor[,1] -wide_df_logor = wide_df_logor[,-1] - -# custom height (log10OR) logo plot: yayy works -ggseqlogo(wide_df_logor, method='custom', seq_type='aa') + ylab('my custom height') + - theme(legend.position = "bottom" - , axis.text.x = element_text(size = 11 - , angle = 90 - , hjust = 1 - , vjust = 0.4) - , axis.text.y = element_text(size = 15 - , angle = 0 - , hjust = 1 - , vjust = 0))+ - labs(title = "AA logo plot" - , x = "Wild-type Position" - , y = "Log10(OR)") - -#======================================================================= -#%% logo plot from sequence - -################# -# Plot: LOGOLAS (ED plots) -# link: https://github.com/kkdey/Logolas -# on all pncA samples: output of mutate.py -################ -library(Logolas) - -seqs = read.csv('~/git//Data/pyrazinamide/output/pnca_msa.txt' - , header = FALSE - , stringsAsFactors = FALSE)$V1 - - -# my_data: useful! -logomaker(seqs, type = "EDLogo", color_type = 'per_symbol' - , return_heights = TRUE) -logomaker(seqs, type = "Logo", color_type = 'per_symbol') - -#%% end of script -#======================================================================= \ No newline at end of file