adding clean files for rerun of 35k dataset

This commit is contained in:
Tanushree Tunstall 2020-07-07 18:28:55 +01:00
parent 943513a338
commit a7f21cfb14
32 changed files with 157 additions and 44550 deletions


@@ -1,319 +0,0 @@
#!/usr/bin/env Rscript
#require('compare')
require('getopt', quietly=TRUE) # We need to be able to parse arguments
#########################################################
# TASK: To calculate Allele Frequency and
# Odds Ratio from master data
# and add the calculated params to meta_data extracted from
# data_extraction.py
#########################################################
#getwd()
setwd('~/git/LSHTM_analysis/scripts')
cat(c(getwd(),'\n'))
# Command line args
spec = matrix(c(
"drug" , "d", 1, "character",
"gene" , "g", 1, "character"
), byrow = TRUE, ncol = 4)
opt = getopt(spec);
drug = opt$drug
gene = opt$gene
if(is.null(drug)|is.null(gene)) {
stop('Missing arguments: --drug and --gene must both be specified (case-sensitive)')
}
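# Example invocation (script name illustrative; getopt accepts the long flags defined above):
# Rscript af_or_calcs.R --drug pyrazinamide --gene pncA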
#options(scipen = 999) #disabling scientific notation in R.
#options(scipen = 4)
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene,'_p.')
cat(gene_match)
#===========
# input
#===========
# infile1: Raw data
#indir = 'git/Data/pyrazinamide/input/original'
indir = paste0('~/git/Data')
in_filename = 'original_tanushree_data_v2.csv'
#in_filename = 'mtb_gwas_v3.csv'
infile = paste0(indir, '/', in_filename)
cat(paste0('Reading infile1: raw data', ' ', infile) )
# infile2: gene associated meta data file to extract valid snps and add calcs to.
# This is outfile3 from data_extraction.py
indir_metadata = paste0('~/git/Data', '/', drug, '/', 'output')
in_filename_metadata = 'pnca_metadata.csv'
infile_metadata = paste0(indir_metadata, '/', in_filename_metadata)
cat(paste0('Reading infile2: gene associated metadata:', infile_metadata))
#===========
# output
#===========
# outdir = 'git/Data/pyrazinamide/output'
outdir = paste0('~/git/Data', '/', drug, '/', 'output')
#out_filename = paste0(tolower(gene), '_meta_data_with_AF_OR.csv')
out_filename = paste0(tolower(gene), '_af_or.csv')
outfile = paste0(outdir, '/', out_filename)
cat(paste0('Output file with full path:', outfile))
#%% end of variable assignment for input and output files
#########################################################
# 1: Read master/raw data stored in Data/
#########################################################
raw_data_all = read.csv(infile, stringsAsFactors = F)
# building cols to extract
dr_muts_col = paste0('dr_mutations_', drug)
other_muts_col = paste0('other_mutations_', drug)
cat('Extracting columns based on variables:\n'
, drug
, '\n'
, dr_muts_col
, '\n'
, other_muts_col
, '\n===============================================================')
raw_data = raw_data_all[,c("id"
, drug
, dr_muts_col
, other_muts_col)]
rm(raw_data_all)
rm(indir, in_filename, infile)
#===========
# 1a: exclude na
#===========
raw_data = raw_data[!is.na(raw_data[[drug]]),]
total_samples = length(unique(raw_data$id))
cat(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples))
# sanity check: should be true
is.numeric(total_samples)
#===========
# 1b: combine the two mutation columns
#===========
all_muts_colname = paste0('all_mutations_', drug)
#raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)
raw_data[[all_muts_colname]] = paste(raw_data[[dr_muts_col]], raw_data[[other_muts_col]])
head(raw_data[[all_muts_colname]])
#===========
# 1c: create yet another column that contains all the mutations but in lower case
#===========
head(raw_data[[all_muts_colname]])
raw_data$all_muts_gene = tolower(raw_data[[all_muts_colname]])
head(raw_data$all_muts_gene)
# sanity checks
#table(grepl("gene_p",raw_data$all_muts_gene))
cat(paste0('converting gene match:', gene_match, ' ', 'to lowercase'))
gene_match = tolower(gene_match)
table(grepl(gene_match,raw_data$all_muts_gene))
# sanity check: should be TRUE
#sum(table(grepl("gene_p",raw_data$all_muts_gene))) == total_samples
# sanity check
if(sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples){
cat('PASS: Total no. of samples match')
} else{
cat('FAIL: No. of samples mismatch')
}
#########################################################
# 2: Read valid snps for which OR
# can be calculated
#########################################################
cat(paste0('Reading metadata infile:', infile_metadata))
gene_metadata = read.csv(infile_metadata
#, file.choose()
, stringsAsFactors = F
, header = T)
# clear variables
rm(in_filename_metadata, infile_metadata)
# count na in pyrazinamide column
tot_pza_na = sum(is.na(gene_metadata$pyrazinamide))
expected_rows = nrow(gene_metadata) - tot_pza_na
# drop na from the pyrazinamide column
gene_snps_or = gene_metadata[!is.na(gene_metadata[[drug]]),]
# sanity check
if(nrow(gene_snps_or) == expected_rows){
cat('PASS: no. of rows match with expected_rows')
} else{
cat('FAIL: nrows mismatch.')
}
# extract unique snps to iterate over for AF and OR calcs
gene_snps_unique = unique(gene_snps_or$mutation)
cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_snps_unique)))
#===========================================================================================
#########################
# custom chisq function:
# To calculate OR
#########################
i = "pnca_p.trp68gly"
mut = grepl(i,raw_data$all_muts_gene)
mut = as.numeric(mut)
dst = raw_data[[drug]]
#x = as.numeric(mut)
#y = dst
mychisq_or = function(x,y){
tab = as.matrix(table(x,y))
a = tab[2,2]
if (a==0){ a<-0.5}
b = tab[2,1]
if (b==0){ b<-0.5}
c = tab[1,2]
if (c==0){ c<-0.5}
d = tab[1,1]
if (d==0){ d<-0.5}
(a/b)/(c/d)
}
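# Worked example (illustrative counts) for the 2x2 table(dst, mut):
# a = tab[2,2] = 10 (resistant, mutant), b = tab[2,1] = 5 (resistant, wild-type),
# c = tab[1,2] = 20 (susceptible, mutant), d = tab[1,1] = 100 (susceptible, wild-type)
# OR = (a/b)/(c/d) = (10/5)/(20/100) = 10
# Zero cells are replaced with 0.5 (Haldane-Anscombe correction) to avoid division by zero.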
or_mychisq = mychisq_or(dst, mut)
print(paste0('mychisq OR:', or_mychisq ))
#=====================================
#OR calcs using the following:
#1) chisq.test
#2) fisher
#3) modified chisq.test
#4) logistic
#5) adjusted logistic?
#6) kinship (separate script)
#======================================
# TEST FOR a few muts: sapply and df
#===============================================
snps <- gene_snps_unique # reassign so you test with subset of muts
#snps <- gene_snps_unique[1:2]
cat(paste0('Running calculations for:', length(snps), ' nssnps\n'
, 'gene: ', gene
, '\ndrug: ', drug ))
# DV: pyrazinamide 0 or 1
dst = raw_data[[drug]]
# initialise an empty df
ors_df = data.frame()
x = sapply(snps,function(m){
mut = grepl(m,raw_data$all_muts_gene)
mut = as.numeric(mut)
cat(paste0('Running mutation:', m, '\n'))
model<-glm(dst ~ mut, family = binomial)
#-------------------
# allele frequency
#-------------------
afs = mean(mut)
#-------------------
# logistic model
#-------------------
beta_logistic = summary(model)$coefficients[2,1]
or_logistic = exp(summary(model)$coefficients[2,1])
#print(paste0('logistic OR:', or_logistic))
pval_logistic = summary(model)$coefficients[2,4]
#print(paste0('logistic pval:', pval_logistic))
se_logistic = summary(model)$coefficients[2,2]
#print(paste0('logistic SE:', se_logistic))
zval_logistic = summary(model)$coefficients[2,3]
#print(paste0('logistic zval:', zval_logistic))
ci_mod = exp(confint(model))[2,]
#print(paste0('logistic CI:', ci_mod))
ci_lower_logistic = ci_mod[["2.5 %"]]
ci_upper_logistic = ci_mod[["97.5 %"]]
#-------------------
# custom_chisq and fisher: OR p-value and CI
#-------------------
or_mychisq = mychisq_or(dst, mut)
#print(paste0('mychisq OR:', or_mychisq))
odds_fisher = fisher.test(table(dst, mut))$estimate
or_fisher = odds_fisher[[1]]
pval_fisher = fisher.test(table(dst, mut))$p.value
ci_lower_fisher = fisher.test(table(dst, mut))$conf.int[1]
ci_upper_fisher = fisher.test(table(dst, mut))$conf.int[2]
#-------------------
# chi sq estimates
#-------------------
estimate_chisq = chisq.test(table(dst, mut))$statistic; estimate_chisq
est_chisq = estimate_chisq[[1]]; print(est_chisq)
pval_chisq = chisq.test(table(dst, mut))$p.value
# build a row to append to df
row = data.frame(mutation = m
, af = afs
, beta_logistic = beta_logistic
, or_logistic = or_logistic
, pval_logistic = pval_logistic
, se_logistic = se_logistic
, zval_logistic = zval_logistic
, ci_low_logistic = ci_lower_logistic
, ci_hi_logistic = ci_upper_logistic
, or_mychisq = or_mychisq
, or_fisher = or_fisher
, pval_fisher = pval_fisher
, ci_low_fisher= ci_lower_fisher
, ci_hi_fisher = ci_upper_fisher
, est_chisq = est_chisq
, pval_chisq = pval_chisq
)
#print(row)
ors_df <<- rbind(ors_df, row)
})
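# NOTE: sapply() is used here purely for its side effect; each iteration appends
# one row to the global ors_df via '<<-', so 'x' itself is not used downstream.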
#%%======================================================
# Writing file with calculated ORs and AFs
cat(paste0('writing output file: '
, '\nFilename: ', out_filename))
write.csv(ors_df, outfile
, row.names = F)
cat(paste0('Finished writing:'
, outfile
, '\nNo. of rows: ', nrow(ors_df)
, '\nNo. of cols: ', ncol(ors_df)))
#************************************************
cat('\n======================================================================\n')
cat('End of script: calculated AF, OR, pvalues and saved file')


@@ -1,51 +0,0 @@
#!/usr/bin/env python3
from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
import re
import os
#%%
def myalign(ref_seq, pdb_seq):
myalign_dict = {}
alignments = pairwise2.align.globalxx(ref_seq, pdb_seq)
#alignments = pairwise2.align.localxx(ref, struct)
match = []
for a, b in zip(alignments[0][0], alignments[0][1]):
if a == b:
match.append('|')
else:
match.append(' ')
#print(match)
print(alignments[0][0])
print("".join(match))
print(alignments[0][1])
result_align = alignments[0][1]
#print(result_align)
print('===============================================================\n')
# update dict
myalign_dict.update({'aligned_fasta': result_align})
# find start and end of match
aa_regex = r'\w'
m = re.search(aa_regex, result_align)
#m = my_match.span()
offset = m.start()
offset_end = m.end()
print('start of match:', offset
, '\nend of match:', offset_end)
print('===============================================================\n')
# update dict
myalign_dict.update({'start_match' : offset})
myalign_dict.update({'end_match' : offset_end})
return myalign_dict
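# Example usage (sequences illustrative): align a truncated pdb sequence to a reference.
# res = myalign('MRALIIVDVQNDF', 'ALIIVDVQNDF')
# res['aligned_fasta'] # pdb sequence aligned to the reference, gaps shown as '-'
# res['start_match'] # 0-based position of the first aligned (non-gap) residue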


@@ -1,24 +0,0 @@
#!/usr/bin/python3
#=======================================================================
# TASK: select specified chains from the pdb & save a cropped PDB with
# the selected chains. Useful for modelling dimers, etc.
# link for saving each chain as a separate file
# https://stackoverflow.com/questions/11685716/how-to-extract-chains-from-a-pdb-file
#=======================================================================
from Bio.PDB import PDBParser, PDBIO, Select
# Select subclass: accept_chain() returns 1 (True) for every chain whose id is in 'chain'
class ChainExtract(Select):
def __init__(self, chain):
self.chain = chain
def accept_chain(self, chain):
#print(dir(chain))
if chain.id in self.chain:
return 1
else:
return 0
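# Example usage (file names illustrative): keep only chains A and B from a complex.
# p = PDBParser(QUIET = True)
# structure = p.get_structure('complex', 'complex.pdb')
# io = PDBIO()
# io.set_structure(structure)
# io.save('complex_chainsAB.pdb', ChainExtract(['A', 'B']))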


@@ -1,28 +0,0 @@
#!/usr/bin/python3
#=======================================================================
# TASK: extract chain from pdb and save each chain as a separate file
# link for saving each chain as a separate file
#=======================================================================
__description__ = \
"""
pdb_chain_splitter.py
extracts chains and saves them as separate pdb files.
"""
__author__ = "Tanushree Tunstall"
__date__ = ""
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser
class ChainSelect(Select):
def __init__(self, chain):
self.chain = chain
def accept_chain(self, chain):
if chain.get_id() == self.chain:
return 1
else:
return 0
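# Example usage (file names illustrative): write each chain to its own pdb file.
# p = PDBParser(QUIET = True)
# structure = p.get_structure('complex', 'complex.pdb')
# io = PDBIO()
# io.set_structure(structure)
# for chain in structure.get_chains():
#     io.save(chain.get_id() + '.pdb', ChainSelect(chain.get_id()))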


@@ -1,177 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
# FIXME: change filename 2(mcsm normalised data)
# to be consistent like (pnca_complex_mcsm_norm.csv) : changed manually, but ensure this is done in the mcsm pipeline
#=======================================================================
# Task: combine 2 dfs on common cols by detecting them
# includes sanity checks
#=======================================================================
#%% load packages
import sys, os
import pandas as pd
import numpy as np
import re
#from varname import nameof
#%% end of variable assignment for input and output files
#=======================================================================
#%% function/methd to combine dfs
def detect_common_cols (df1, df2):
"""
Detect common cols
@param df1: df
@type df1: pandas df
@param df2: df
@type df2: pandas df
@return: common cols
@type: list
"""
common_cols = np.intersect1d(df1.columns, df2.columns).tolist()
print('Length of comm_cols:', len(common_cols)
, '\nmerging column/s:', common_cols
, '\ntype:', type(common_cols)
, '\ndtypes in merging columns:\n', df1[common_cols].dtypes)
return common_cols
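# Example (illustrative): if df1 has columns ['mutationinformation', 'position', 'duet_stability_change']
# and df2 has ['mutationinformation', 'position', 'ddg'], this returns
# ['mutationinformation', 'position'].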
#%% Function to combine 2 dfs by detecting common cols and performing
# sanity checks on the output df
def combine_dfs_with_checks(df1, df2, my_join = 'outer'):
"""
Combine 2 dfs by finding merging columns automatically
@param df1: data frame
@type df1: pandas df
@param df2: data frame
@type df2: pandas df
@my_join: join type for merging
@type my_join: string
@return: combined_df
@type: pandas df
"""
print('Finding comm_cols and merging cols:'
,'\n=========================================================')
common_cols = np.intersect1d(df1.columns, df2.columns).tolist()
print('Length of comm_cols:', len(common_cols)
, '\nmerging column/s:', common_cols
, '\ntype:', type(common_cols))
#print('\ndtypes in merging columns:\n', df1[common_cols].dtypes)
print('selecting consistent dtypes for merging (object i.e string)')
#merging_cols = df1[comm_valson_cols].select_dtypes(include = [object]).columns.tolist()
#merging_cols = df1[comm_valson_cols].select_dtypes(include = ['int64']).columns.tolist()
merging_cols = common_cols.copy()
nmerging_cols = len(merging_cols)
print(' length of merging cols:', nmerging_cols
, '\nmerging cols:', merging_cols, 'type:', type(merging_cols)
, '\n=========================================================')
#========================
# merge 1 (combined_df)
# concatenating 2dfs:
# df1, df2
#========================
# checking cross-over of mutations in the two dfs to merge
ndiff_1 = df1[merging_cols].squeeze().isin(df2[merging_cols].squeeze()).sum()
ndiff1 = df1.shape[0] - ndiff_1
print('There are', ndiff1, 'unmatched mutations in left df')
#missing_mutinfo = df1[~left_df['mutationinformation'].isin(df2['mutationinformation'])]
#missing_mutinfo.to_csv('infoless_muts.csv')
ndiff_2 = df2[merging_cols].squeeze().isin(df1[merging_cols].squeeze()).sum()
ndiff2 = df2.shape[0] - ndiff_2
print('There are', ndiff2, 'unmatched mutations in right_df')
#comm_vals = np.intersect1d(df1[merging_cols], df2[merging_cols])
#comm_vals_count = len(comm_vals)
#print('length of comm_valson values:', comm_vals_count , '\ntype:', type(comm_vals_count))
#========================
# merging dfs & sanity checks
#========================
fail = False
print('combining with:', my_join)
comb_df = pd.merge(df1, df2, on = merging_cols, how = my_join)
expected_cols = df1.shape[1] + df2.shape[1] - nmerging_cols
if my_join == 'right':
df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
expected_rows = df2_nd.shape[0]
if my_join == 'left':
expected_rows = df1.shape[0]
#if my_join == 'inner':
# expected_rows = comm_vals_count
#if my_join == 'outer':
# df1_nd = df1.drop_duplicates(merging_cols, keep = 'first')
# df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
# expected_rows = df1_nd.shape[0] + df2_nd.shape[0] - comm_vals_count
if my_join in ('inner', 'outer') and len(merging_cols) > 1:
#comm_vals = np.intersect1d(df1['mutationinformation'], df2['mutationinformation'])
print('length of merging_cols > 1, therefore omitting row checks')
combined_df = comb_df.copy()
expected_rows = len(combined_df)
else:
comm_vals = np.intersect1d(df1[merging_cols], df2[merging_cols])
print('length of merging_cols == 1, calculating expected rows in merged_df')
combined_df = comb_df.drop_duplicates(subset = merging_cols, keep ='first')
if my_join == 'inner':
expected_rows = len(comm_vals)
if my_join == 'outer':
df1_nd = df1.drop_duplicates(merging_cols, keep = 'first')
df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
expected_rows = df1_nd.shape[0] + df2_nd.shape[0] - len(comm_vals)
if len(combined_df) == expected_rows and len(combined_df.columns) == expected_cols:
print('PASS: successfully combined dfs with:', my_join, 'join')
else:
print('FAIL: combined_df\'s expected rows and cols not matched')
fail = True
print('\nExpected no. of rows:', expected_rows
, '\nGot:', len(combined_df)
, '\nExpected no. of cols:', expected_cols
, '\nGot:', len(combined_df.columns))
if fail:
sys.exit()
#if clean:
#foo = combined_df2.filter(regex = r'.*_x|_y', axis = 1)
#print(foo.columns)
#print('Detected duplicate cols with suffix: _x _y'
# , '\Dropping duplicate cols and cleaning')
# drop position col containing suffix '_y' and then rename col without suffix
combined_df_clean = combined_df.drop(combined_df.filter(regex = r'.*_y').columns, axis = 1)
combined_df_clean.rename(columns=lambda x: re.sub('_x$','', x), inplace = True)
return combined_df_clean
#%% end of function
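# Example call (dataframes illustrative):
# combined = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = 'outer')
# The function exits via sys.exit() if the merged shape differs from the expected
# rows/cols, then drops duplicate '_y' columns and strips the '_x' suffix before returning.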
#=======================================================================


@@ -1,287 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
# FIXME: change filename 2(mcsm normalised data)
# to be consistent like (pnca_complex_mcsm_norm.csv) : changed manually, but ensure this is done in the mcsm pipeline
#=======================================================================
# Task: combine 2 dfs with aa position as linking column
# Input: 2 dfs
# <gene.lower()>_complex_mcsm_norm.csv
# <gene.lower()>_foldx.csv
# Output: .csv of all 2 dfs combined
# useful link
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=======================================================================
#%% load packages
import sys, os
import pandas as pd
import numpy as np
#from varname import nameof
import argparse
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
# FIXME: local imports
#from combining import combine_dfs_with_checks
from combining_FIXME import detect_common_cols
#=======================================================================
#%% command line args
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
#args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'
#drug = args.drug
#gene = args.gene
#======
# dirs
#======
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/' + 'input'
outdir = datadir + '/' + drug + '/' + 'output'
#=======
# input
#=======
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv'
in_filename_foldx = gene.lower() + '_foldx.csv'
in_filename_dssp = gene.lower() + '_dssp.csv'
in_filename_kd = gene.lower() + '_kd.csv'
in_filename_rd = gene.lower() + '_rd.csv'
in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info.csv'
in_filename_afor = gene.lower() + '_af_or.csv'
in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
infile_mcsm = outdir + '/' + in_filename_mcsm
infile_foldx = outdir + '/' + in_filename_foldx
infile_dssp = outdir + '/' + in_filename_dssp
infile_kd = outdir + '/' + in_filename_kd
infile_rd = outdir + '/' + in_filename_rd
infile_snpinfo = indir + '/' + in_filename_snpinfo
infile_afor = outdir + '/' + in_filename_afor
infile_afor_kin = outdir + '/' + in_filename_afor_kin
print('\nInput path:', outdir
, '\nInput filename mcsm:', infile_mcsm
, '\nInput filename foldx:', infile_foldx
, '\nInput filename dssp:', infile_dssp
, '\nInput filename kd:', infile_kd
, '\nInput filename rd', infile_rd
, '\nInput filename snp info:', infile_snpinfo
, '\nInput filename af or:', infile_afor
, '\nInput filename afor kinship:', infile_afor_kin
, '\n============================================================')
#=======
# output
#=======
out_filename_comb = gene.lower() + '_all_params.csv'
outfile_comb = outdir + '/' + out_filename_comb
print('Output filename:', outfile_comb
, '\n============================================================')
o_join = 'outer'
l_join = 'left'
r_join = 'right'
i_join = 'inner'
# end of variable assignment for input and output files
#%%====================================================================
mcsm_df = pd.read_csv(infile_mcsm, sep = ',')
mcsm_df.columns = mcsm_df.columns.str.lower()
foldx_df = pd.read_csv(infile_foldx , sep = ',')
print('==================================='
, '\nFirst merge: mcsm + foldx'
, '\n===================================')
#mcsm_foldx_dfs = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = o_join)
merging_cols_m1 = detect_common_cols(mcsm_df, foldx_df)
mcsm_foldx_dfs = pd.merge(mcsm_df, foldx_df, on = merging_cols_m1, how = 'outer')
ncols_m1 = len(mcsm_foldx_dfs.columns)
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print('==================================='
, '\nSecond merge: dssp + kd'
, '\n===================================')
dssp_df = pd.read_csv(infile_dssp, sep = ',')
kd_df = pd.read_csv(infile_kd, sep = ',')
rd_df = pd.read_csv(infile_rd, sep = ',')
#dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = o_join)
merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
dssp_kd_dfs = pd.merge(dssp_df, kd_df, on = merging_cols_m2, how = 'outer')
print('==================================='
, '\nThird merge: dssp_kd_dfs + rd_df'
, '\n===================================')
#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = o_join)
merging_cols_m3 = detect_common_cols(dssp_kd_dfs, rd_df)
dssp_kd_rd_dfs = pd.merge(dssp_kd_dfs, rd_df, on = merging_cols_m3, how = 'outer')
ncols_m3 = len(dssp_kd_rd_dfs.columns)
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print('==================================='
, '\nFourth merge: First merge + Third merge'
, '\n===================================')
#combined_dfs = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = i_join)# gives wrong!
merging_cols_m4 = detect_common_cols(mcsm_foldx_dfs, dssp_kd_rd_dfs)
combined_df_expected_cols = ncols_m1 + ncols_m3 - len(merging_cols_m4)
combined_df = pd.merge(mcsm_foldx_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how = 'inner')
if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
print('PASS: successfully combined 5 dfs'
, '\nnrows combined_df:', len(combined_df)
, '\ncols combined_df:', len(combined_df.columns))
else:
sys.exit('FAIL: check individual df merges')
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#%% OR combining
afor_df = pd.read_csv(infile_afor, sep = ',')
afor_df.columns = afor_df.columns.str.lower()
if afor_df['mutation'].shape[0] == afor_df['mutation'].nunique():
print('No duplicate muts detected in afor_df')
else:
print('Dropping duplicate muts detected in afor_df')
afor_df = afor_df.drop_duplicates(subset = 'mutation', keep = 'first')
snpinfo_df_all = pd.read_csv(infile_snpinfo, sep = ',')
snpinfo_df = snpinfo_df_all[['mutation', 'mutationinformation']]
if snpinfo_df['mutation'].shape[0] == snpinfo_df['mutation'].nunique():
print('No duplicate muts detected in snpinfo_df')
else:
dups = snpinfo_df['mutation'].duplicated().sum()
print( dups, 'Duplicate muts detected in snpinfo_df'
, '\nDim:', snpinfo_df.shape)
print('Dropping duplicate muts')
snpinfo_df = snpinfo_df.drop_duplicates(subset = 'mutation', keep = 'first')
print('Dim:', snpinfo_df.shape)
print('==================================='
, '\nFifth merge: afor_df + snpinfo_df'
, '\n===================================')
merging_cols_m5 = detect_common_cols(afor_df, snpinfo_df)
afor_snpinfo_dfs = pd.merge(afor_df, snpinfo_df, on = merging_cols_m5, how = 'left')
if len(afor_snpinfo_dfs) == afor_df.shape[0]:
print('PASS: successfully combined with left join'
, '\nDim of df1:', afor_df.shape
, '\nDim of df2:', snpinfo_df.shape)
else:
sys.exit('FAIL: unsuccessful merge')
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',')
afor_kin_df.columns = afor_kin_df.columns.str.lower()
print('==================================='
, '\nSixth merge: afor_snpinfo_dfs + afor_kin_df'
, '\n===================================')
merging_cols_m6 = detect_common_cols(afor_snpinfo_dfs, afor_kin_df)
print('Dim of df1:', afor_snpinfo_dfs.shape
, '\nDim of df2:', afor_kin_df.shape
, '\nno. of merging_cols:', len(merging_cols_m6))
ors_df = pd.merge(afor_snpinfo_dfs, afor_kin_df, on = merging_cols_m6, how = 'outer')
print('Dim of ors_df:', ors_df.shape)
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print('==================================='
, '\nSeventh merge: combined_df + ors_df'
, '\n===================================')
merging_cols_m7 = detect_common_cols(combined_df, ors_df)
print('Dim of df1:', combined_df.shape
, '\nDim of df2:', ors_df.shape
, '\nno. of merging_cols:', len(merging_cols_m7))
print('checking mutations in the two dfs:'
, '\nmuts in df1 but NOT in df2:'
, combined_df['mutationinformation'].isin(ors_df['mutationinformation']).sum()
, '\nmuts in df2 but NOT in df1:'
, ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum())
#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df['mutationinformation']) )
#combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = 'outer') # FIXME
combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = 'left')
outdf_expected_rows = len(combined_df)
outdf_expected_cols = len(combined_df.columns) + len(ors_df.columns) - len(merging_cols_m7)
print('\nDim of combined_df_all:', combined_df_all.shape
, '\nwith join type: ????')
if combined_df_all.shape[1] == outdf_expected_cols:
print('combined_df has expected no. of cols')
if combined_df_all.shape[0] == outdf_expected_rows:
print('combined_df has expected no. of rows')
else:
print('WARNING: nrows discrepancy noted'
, '\nFIX IT')
print ('thing finished')
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# write csv
combined_df_all.to_csv(outfile_comb, index = False)
#=======================================================================
#%% in case you FIX the function: combine_dfs_with_checks
#def main():
# print('Reading input files:')
#mcsm_df = pd.read_csv(infile_mcsm, sep = ',')
#mcsm_df.columns = mcsm_df.columns.str.lower()
#foldx_df = pd.read_csv(infile_foldx , sep = ',')
#dssp_df = pd.read_csv(infile_dssp, sep = ',')
#dssp_df.columns = dssp_df.columns.str.lower()
#kd_df = pd.read_csv(infile_kd, sep = ',')
#kd_df.columns = kd_df.columns.str.lower()
#rd_df = pd.read_csv(infile_kd, sep = ',')
#if __name__ == '__main__':
# main()
#=======================================================================
#%% end of script


@@ -1,9 +0,0 @@
#!/usr/bin/env Rscript
print('R Argument Test')
cmd <- paste(commandArgs(), collapse=" ")
cat("How R was invoked:\n");
cat(cmd, "\n")
args <- commandArgs(trailingOnly = TRUE)
cat(c('Command Line Arguments supplied: ', args))


@@ -21,6 +21,9 @@ Created on Tue Aug 6 12:56:03 2019
# where each row is a separate mutation
# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique
+# NOTE
+#drtype is renamed to 'resistance' in the 35k dataset
# output files: all lower case
# 0) <gene>_common_ids.csv
# 1) <gene>_ambiguous_muts.csv
@@ -60,6 +63,7 @@ os.getcwd()
# import aa dict
from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
+from tidy_split import tidy_split
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
@@ -96,8 +100,8 @@ datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
-in_filename = 'original_tanushree_data_v2.csv'
-#in_filename = 'mtb_gwas_v3.csv'
+#in_filename = 'original_tanushree_data_v2.csv' #19k
+in_filename = 'mtb_gwas_meta_v3.csv' #33k
infile = datadir + '/' + in_filename
print('Input file: ', infile
, '\n============================================================')
@@ -121,17 +125,45 @@ master_data = pd.read_csv(infile, sep = ',')
#list(master_data.columns)
# extract elevant columns to extract from meta data related to the drug
-meta_data = master_data[['id'
-,'country'
-,'lineage'
-,'sublineage'
-,'drtype'
-, drug
-, dr_muts_col
-, other_muts_col
-]]
-del(master_data)
+#meta_data_ch = master_data[['id'
+#, 'country'
+#, 'lineage'
+#, 'sublineage'
+##, 'drtype' #19k only
+#, 'resistance'
+#, drug
+#, dr_muts_col
+#, other_muts_col]]
+core_cols = ['id'
+, 'country'
+, 'country2'
+, 'geographic_source'
+, 'region'
+, 'date'
+, 'strain'
+, 'lineage'
+, 'sublineage' #drtype renamed to resistance
+, 'resistance'
+, 'location'
+, 'host_body_site'
+, 'environment_material'
+, 'host_status'
+, 'hiv_status'
+, 'HIV_status'
+, 'isolation_source']
+variable_based_cols = [drug
+, dr_muts_col
+, other_muts_col]
+cols_to_extract = core_cols + variable_based_cols
+meta_data = master_data[cols_to_extract]
+del(master_data, variable_based_cols, cols_to_extract)
# checks and results
total_samples = meta_data['id'].nunique()
@@ -269,14 +301,23 @@ print('gene to extract:', gene_match )
#===============
# FIXME: replace drug with variable containing the drug name
# !!! important !!!
-meta_data_dr = meta_data[['id'
-,'country'
-,'lineage'
-,'sublineage'
-,'drtype'
-, drug
-, dr_muts_col
-]]
+#meta_data_dr = meta_data[['id'
+# ,'country'
+# ,'lineage'
+# ,'sublineage'
+# ,'drtype'
+# , drug
+# , dr_muts_col
+# ]]
+dr_based_cols = [drug, dr_muts_col]
+cols_to_extract = core_cols + dr_based_cols
+meta_data_dr = meta_data[cols_to_extract]
+del(dr_based_cols, cols_to_extract)
print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
print('actual dim:', meta_data_dr.shape
, '\n===============================================================')
@@ -306,14 +347,22 @@ dr_id = pd.Series(dr_id)
print('Extracting dr_muts from:', other_muts_col,'with other meta_data')
# FIXME: replace drug with variable containing the drug name
# !!! important !!!
-meta_data_other = meta_data[['id'
-,'country'
-,'lineage'
-,'sublineage'
-,'drtype'
-, drug
-, other_muts_col
-]]
+#meta_data_other = meta_data[['id'
+# ,'country'
+# ,'lineage'
+# ,'sublineage'
+## ,'drtype'
+# , drug
+# , other_muts_col
+# ]]
+dr_based_cols = [drug, other_muts_col]
+cols_to_extract = core_cols + dr_based_cols
+meta_data_other = meta_data[cols_to_extract]
+del(dr_based_cols, cols_to_extract)
print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
print('actual dim:', meta_data_other.shape
@@ -373,7 +422,7 @@ print('Writing file:'
, '\nExpected no. of rows:', len(common_ids)
, '\n=============================================================')
-common_ids.to_csv(outfile0)
+common_ids.to_csv(outfile0, index = False)
del(out_filename0)
# clear variables
@@ -419,44 +468,15 @@ print('This is still dirty data: samples have ', gene_match, 'muts but may have
#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
print('Performing tidy_split(): to separate the mutations into indivdual rows')
-# define the split function
-def tidy_split(df, column, sep='|', keep=False):
-'''
-Split the values of a column and expand so the new DataFrame has one split
-value per row. Filters rows where the column is missing.
-Params
-------
-df : pandas.DataFrame
-dataframe with the column to split and expand
-column : str
-the column to split and expand
-sep : str
-the string used to split the column's values
-keep : bool
-whether to retain the presplit value as it's own row
-Returns
--------
-pandas.DataFrame
-Returns a dataframe with the same columns as `df`.
-'''
-indexes = list()
-new_values = list()
-#df = df.dropna(subset=[column])#!!!!-----see this incase you need to uncomment based on use case
-for i, presplit in enumerate(df[column].astype(str)):
-values = presplit.split(sep)
-if keep and len(values) > 1:
-indexes.append(i)
-new_values.append(presplit)
-for value in values:
-indexes.append(i)
-new_values.append(value)
-new_df = df.iloc[indexes, :].copy()
-new_df[column] = new_values
-return new_df
-#%% end of tidy_split()
+#TIDY SPLIT HERE
#=========
# DF1: dr_muts_col
#=========
@@ -761,12 +781,11 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m
out_filename1 = gene.lower() + '_ambiguous_muts.csv'
outfile1 = outdir + '/' + out_filename1
print('Writing file: ambiguous muts'
-, '\nFilename:', out_filename1
-, '\nPath:', outdir)
+, '\nFilename:', outfile1)
#common_muts = ['gene_matchVal180Phe','gene_matchGln10Pro'] # test
inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
-inspect.to_csv(outfile1)
+inspect.to_csv(outfile1, index = False)
print('Finished writing:', out_filename1
, '\nNo. of rows:', len(inspect)
@@ -1069,13 +1088,13 @@ else:
print('FAIL: SNP has NA, Possible mapping issues from dict?'
, '\nDebug please!'
, '\n=========================================================')
+sys.exit()
out_filename2 = gene.lower() + '_mcsm_snps.csv'
outfile2 = outdir + '/' + out_filename2
print('Writing file: mCSM style muts'
-, '\nFilename:', out_filename2
-, '\nPath:', outdir
+, '\nFilename:', outfile2
, '\nmutation format (SNP): {WT}<POS>{MUT}'
, '\nNo. of distinct muts:', len(snps_only)
, '\nNo. of distinct positions:', len(pos_only)
@@ -1083,7 +1102,7 @@ print('Writing file: mCSM style muts'
snps_only.to_csv(outfile2, header = False, index = False)
-print('Finished writing:', out_filename2
+print('Finished writing:', outfile2
, '\nNo. of rows:', len(snps_only)
, '\nNo. of cols:', len(snps_only.columns)
, '\n=============================================================')
@@ -1099,7 +1118,7 @@ print('Writing file: LF formatted data'
, '\n============================================================')
gene_LF1.to_csv(outfile3, header = True, index = False)
-print('Finished writing:', out_filename3
+print('Finished writing:', outfile3
, '\nNo. of rows:', len(gene_LF1)
, '\nNo. of cols:', len(gene_LF1.columns)
, '\n=============================================================')
@@ -1118,11 +1137,11 @@ all_muts_msa.columns.dtype
all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')
# create an extra column with protein name
-all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
-all_muts_msa_sorted.head()
+#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
+#all_muts_msa_sorted.head()
# rearrange columns so the fasta name is the first column (required for mutate.script)
-all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
+#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
all_muts_msa_sorted.head()
print('Checking NA in snps...')# should be 0
@@ -1138,15 +1157,14 @@ out_filename4 = gene.lower() +'_all_muts_msa.csv'
outfile4 = outdir + '/' + out_filename4
print('Writing file: mCSM style muts for msa',
-'\nFilename:', out_filename4,
-'\nPath:', outdir,
+'\nFilename:', outfile4,
'\nmutation format (SNP): {WT}<POS>{MUT}',
'\nNo.of lines of msa:', len(all_muts_msa),
)
all_muts_msa_sorted.to_csv(outfile4, header = False, index = False)
-print('Finished writing:', out_filename4
+print('Finished writing:', outfile4
, '\nNo. of rows:', len(all_muts_msa)
, '\nNo. of cols:', len(all_muts_msa.columns)
, '\n=============================================================')
@@ -1177,7 +1195,7 @@ print('Writing file: mutational positions'
pos_only_sorted.to_csv(outfile5, header = True, index = False)
-print('Finished writing:', out_filename5
+print('Finished writing:', outfile5
, '\nNo. of rows:', len(pos_only_sorted)
, '\nNo. of cols:', len(pos_only_sorted.columns)
, '\n=============================================================')


@@ -1,218 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 7 09:30:16 2020
@author: tanu
"""
#=======================================================================
# TASK:
#=======================================================================
#%% load packages
import sys, os
import argparse
import re
import pandas as pd
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
import dms_tools2
import dms_tools2.dssp
import pprint as pp
#=======================================================================
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
#drug = 'isoniazid'
#gene = 'katG'
#drug = 'cycloserine'
#gene = 'alr'
drug = args.drug
gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
#indir = datadir + '/' + drug + '/' + 'output'
indir = datadir + '/' + drug + '/' + 'input'
in_filename = gene.lower() + '_complex' + '.pdb'
#in_filename = 'katg_complex.pdb' # fixme for pnca(consistent filenames i.e pnca_complex.pdb)
infile = indir + '/' + in_filename
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
print('Output path:', outdir)
#out_filename = os.path.splitext(in_filename)[0]+'.dssp' # strip file ext
dssp_filename = gene.lower() + '.dssp'
dssp_file = outdir + '/' + dssp_filename
print('Output dssp:', dssp_file)
dsspcsv_filename = gene.lower() + '_dssp.csv'
dsspcsv_file = outdir + '/' + dsspcsv_filename
print('Outfile dssp to csv: ', dsspcsv_file
, '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% create .dssp from pdb
def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"):
"""
Create a DSSP file from a PDB file
@param inputpdbfile: pdb file
@type inputpdbfile: string
@param outfile: dssp file
@type outfile: string
@param DSSP: DSSP executable (argument to os.system)
@type DSSP: string
@return: none, creates dssp file
"""
# out_file = infile +'.dssp'
# outfile = os.path.splitext(inputpdbfile)[0]+'.dssp' # strip file ext
os.system("%s -i %s -o %s" % (DSSP, inputpdbfile, outfile))
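# The os.system() call above is equivalent to running, e.g.:
#   dssp -i pnca_complex.pdb -o pnca.dssp
# (assumes a DSSP executable that accepts -i/-o, e.g. the classic 'dssp' binary, is on PATH)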
#=======================================================================
#%% extract chain id from dssp
#print(dssp.keys())
#print(dssp.keys()[0][0])
#print(len(dssp))
#print(dssp.keys()[0][0])
#print(dssp.keys()[len(dssp)-1][0])
def extract_chain_dssp(inputpdbfile):
"""
extracts chain_ids from dssp run on pdb file
This is to allow processing of dssp output to df
and for writing as csv file
Parameters
----------
@param inputpdbfile: pdb file
@type inputpdbfile: string
Returns
-------
@return: chain_ids from running dssp on pdb file
@type list
"""
p = PDBParser()
structure = p.get_structure(in_filename, infile)
model = structure[0]
dssp = DSSP(model, infile)
dssp_chains = []
for num_aa in range(0, len(dssp)):
# print(num_aa)
# extract the chain id only and append to a list
dssp_chains.append(dssp.keys()[num_aa][0])
chainsL = list(set(dssp_chains))
print(chainsL)
# sort the list (since sets are not ordered) for convenience
# this will be required for dssp_df
pdbchainlist = sorted(chainsL)
print('dssp output for'
, in_filename, 'contains:', len(pdbchainlist)
, 'chains:\n', pdbchainlist)
return pdbchainlist
#=======================================================================
#%% write csv of processed dssp output
def dssp_to_csv(inputdsspfile, outfile, pdbchainlist = ['A']):
"""
Create a df from a dssp file containing ASA, RSA, SS for all chains
@param infile: dssp file
@type infile: string
@param outfile: csv file
@type outfile: string
@param DSSP: DSSP to df processing using dmstools
@type DSSP: string
@return: none, creates csv file
"""
dssp_df = pd.DataFrame()
print('Total no. of chains: ', len(pdbchainlist))
for chain_id in pdbchainlist:
print('Chain id:', chain_id)
dssp_cur = pd.DataFrame()
dssp_cur = dms_tools2.dssp.processDSSP(inputdsspfile, chain = chain_id)
#!!!Important!!!
dssp_cur['chain_id'] = chain_id
dssp_df = dssp_df.append(dssp_cur)
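# NOTE: DataFrame.append() is deprecated (removed in pandas 2.x); with newer pandas
# use dssp_df = pd.concat([dssp_df, dssp_cur]) instead.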
pp.pprint(dssp_df)
# Rename column 'amino_acid' to 'wild_type_dssp' and 'site' to 'position'
# to match the names used in the file required for merging later.
dssp_df.columns
dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True)
dssp_df.columns
# sanity check
# if len(dssp_df) == len(dssp):
# print('PASS: length of dssp_df has correct length')
# else:
# print('FAIL: length mismatch for dssp_df'
# , '\nexpected length:', len(dssp)
# , '\nGot length:', len(dssp_df)
# , 'Debug please!')
# write to csv
dssp_df.to_csv(outfile, header=True, index = False)
print('Finished writing:', outfile
, '\nNo. of rows:', len(dssp_df)
, '\nNo. of cols:', len(dssp_df.columns)
, '\n==============================================================')
#=======================================================================
#%% call functions
#dssp_file_from_pdb(infile, dssp_file, DSSP = "dssp")
#my_chains = extract_chain_dssp(infile)
#dssp_to_csv(dssp_file, dsspcsv_file, my_chains)
#%%
#=======================================================================
def main():
print('Running dssp with the following params:\n'
, in_filename
, 'outfile:', dsspcsv_filename)
dssp_file_from_pdb(infile, dssp_file, DSSP = "dssp")
my_chains = extract_chain_dssp(infile)
dssp_to_csv(dssp_file, dsspcsv_file, my_chains)
if __name__ == '__main__':
main()
#%% end of script
#=======================================================================
#=======================================================================


@@ -1,99 +0,0 @@
#!/usr/bin/env python3
import pandas as pd
DEBUG = False
#%%
#def find_missense(test_df, ref_allele1, alt_allele0):
def find_missense(test_df, ref_allele_column, alt_allele_column, n_diff_colname = 'n_diff', tot_diff_colname = 'tot_diff', ref_a_colname = 'ref_allele', alt_a_colname = 'alt_allele'):
"""Find mismatches in pairwise comparison of strings b/w col_a and col_b
Case insensitive, converts strings to uppercase before comparison
@test_df: df containing columns to compare
@type: pandas df
@ref_allele_column: column containing ref allele str
@type: str (converts to uppercase)
@alt_allele_column: column containing alt_allele str
@type: str (converts to uppercase)
@n_diff_colname: user defined colname for no. of char diff b/w ref_allele_str and alt_allele_str
@type: str
@tot_diff_colname: user defined colname for total diff (char mismatches + abs length difference), used to flag strings of unequal length
@type: str
@ref_a_colname: user defined colname containing extracted reference allele
@type: str
@alt_a_colname: user defined colname containing extracted alt allele
@type: str
returns df: with 4 columns. If column names clash, the function column
name will override original column
@rtype: pandas df
"""
for ind, val in test_df.iterrows():
if DEBUG:
print('index:', ind, 'value:', val
, '\n============================================================')
ref_a = val[ref_allele_column].upper()
alt_a = val[alt_allele_column].upper()
if DEBUG:
print('ref_allele_string:', ref_a, 'alt_allele_string:', alt_a)
difference = sum(1 for e in zip(ref_a, alt_a) if e[0] != e[1])
test_df.at[ind, n_diff_colname] = difference # adding column
tot_difference = difference + abs(len(ref_a) - len(alt_a))
test_df.at[ind, tot_diff_colname] = tot_difference # adding column
if difference != tot_difference:
print('WARNING: lengths of ref_allele and alt_allele differ at index:', ind
, '\nNon-missense muts detected')
# Now finding the mismatched char
ref_aln = ''
alt_aln = ''
if ref_a == alt_a:
##test_df.at[ind, 'ref_allele'] = 'no_change' # adding column
##test_df.at[ind, 'alt_allele'] = 'no_change' # adding column
test_df.at[ind, ref_a_colname] = 'no_change' # adding column
test_df.at[ind, alt_a_colname] = 'no_change' # adding column
elif len(ref_a) == len(alt_a) and len(ref_a) > 0:
print('ref:', ref_a, 'alt:', alt_a)
for n in range(len(ref_a)):
if ref_a[n] != alt_a[n]:
ref_aln += ref_a[n]
alt_aln += alt_a[n]
##test_df.at[ind, 'ref_allele'] = ref_aln
##test_df.at[ind, 'alt_allele'] = alt_aln
test_df.at[ind, ref_a_colname] = ref_aln
test_df.at[ind, alt_a_colname] = alt_aln
print('ref:', ref_aln)
print('alt:', alt_aln)
else:
##test_df.at[ind, 'ref_allele'] = 'ERROR_Not_nsSNP'
##test_df.at[ind, 'alt_allele'] = 'ERROR_Not_nsSNP'
test_df.at[ind, ref_a_colname] = 'ERROR_Not_nsSNP'
test_df.at[ind, alt_a_colname] = 'ERROR_Not_nsSNP'
return test_df
#========================================
# a representative example
eg_df = {'chromosome_number': [2288719, 2288766, 2288775, 2288779, 2288827, 1111111, 2222222],
'ref_allele1': ['Tc', 'AG', 'AGCACCCTG', 'CCCTGGTGGCC', 'CACA', 'AA', 'CAT'],
'alt_allele0': ['CC', 'CA', 'GGCACCCTGZ','TCCTGGTGGCCAAD', 'TACA', 'AA', 'TCZ']}
# snippet of actual data
#eg_df = pd.read_csv('pnca_assoc.txt', sep = '\t', nrows = 10, header = 0, index_col = False)
eg_df = pd.DataFrame(eg_df)
def main():
#find_missense(eg_df, ref_allele1 = 'ref_allele', alt_allele0 = 'alt_allele')
find_missense(test_df = eg_df, ref_allele_column = 'ref_allele1', alt_allele_column = 'alt_allele0')
print(eg_df)
if __name__ == '__main__':
main()


@@ -1,14 +0,0 @@
#!/usr/bin/env Rscript
require('getopt', quietly=TRUE)
spec = matrix(c(
"drug" , "d", 1, "character",
"gene" , "g", 1, "character"
), byrow=TRUE, ncol=4)
opt = getopt(spec);
drug = opt$drug
gene = opt$gene
cat(c('\nDrug:', drug, '\nGene:', gene, '\n'))


@@ -1,230 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
#=======================================================================
# Task: Hydrophobicity (Kd) values for amino acid sequence using the
# Kyte-Doolittle.
# Same output as using the expasy server (link below)
# Input: fasta file
# Output: csv file with
# useful links
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
# https://web.expasy.org/protscale/pscale/protscale_help.html
#=======================================================================
#%% load packages
import sys, os
import argparse
import pandas as pd
import numpy as np
from pylab import *
from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
#from Bio.Alphabet.IUPAC import IUPACProtein
import pprint as pp
#=======================================================================
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
arg_parser.add_argument('-g', '--gene', help='gene name', default = None)
#arg_parser.add_argument('-p', '--plot', help='show plot', action='store_true')
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = args.drug
gene = args.gene
#plot = args.plot
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
indir = datadir + '/' + drug + '/' + 'input'
in_filename = '3pl1.fasta.txt'
infile = indir + '/' + in_filename
print('Input filename:', in_filename
, '\nInput path:', indir
, '\n============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_kd.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% kd values from fasta file and output csv
def kd_to_csv(inputfasta, outputkdcsv, windowsize = 3):
"""
Calculate kd (hydropathy values) from input fasta file
@param inputfasta: fasta file
@type inputfasta: string
@param outputkdcsv: csv file with kd values
@type outfile: string
@param windowsize: window size to perform KD calcs on (Kyte-Doolittle)
@type windowsize: numeric
@return: none, writes kd values df as csv
"""
#========================
# read input fasta file
#========================
fh = open(inputfasta)
for record in SeqIO.parse(fh, 'fasta'):
id = record.id
seq = record.seq
num_residues = len(seq)
fh.close()
sequence = str(seq)
X = ProteinAnalysis(sequence)
#===================
# calculate KD values: same as the expasy server
#===================
my_window = windowsize
offset = round((my_window/2)-0.5)
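# e.g. windowsize = 3 gives offset = 1, i.e. the profile loses one residue at each end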
# edge weight is set to default (100%)
kd_values = (X.protein_scale(ProtParamData.kd , window = my_window))
# sanity checks
print('Sequence Length:', num_residues)
print('kd_values Length:',len(kd_values))
print('Window Length:', my_window)
print('Window Offset:', offset)
print('=================================================================')
print('Checking:len(kd values) is as expected for the given window size & offset...')
expected_length = num_residues - (my_window - offset)
if len(kd_values) == expected_length:
print('PASS: expected and actual length of kd values match')
else:
print('FAIL: length mismatch'
,'\nExpected length:', expected_length
,'\nActual length:', len(kd_values)
, '\n=========================================================')
#===================
# creating two dfs
#===================
# 1) aa sequence and 2) kd_values. Then reset index for each df
# which will allow easy merging of the two dfs.
# df1: df of aa seq with index reset to start from 1
# (reflective of the actual aa position in a sequence)
# Name column of wt as 'wild_type' to be the same name used
# in the file required for merging later.
dfSeq = pd.DataFrame({'wild_type_kd':list(sequence)})
dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive
# df2: df of kd_values with index reset to start from offset + 1 and
# subsequent matched length of the kd_values
dfVals = pd.DataFrame({'kd_values':kd_values})
dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
# sanity checks
max(dfVals['kd_values'])
min(dfVals['kd_values'])
#===================
# concatenating dfs
#===================
# Merge the two on index
# (as these are now reflective of the aa position numbers): df1 and df2
# This will introduce NaN where there is missing values. In our case this
# will be 2 (first and last ones based on window size and offset)
kd_df = pd.concat([dfSeq, dfVals], axis = 1)
#============================
# renaming index to position
#============================
kd_df = kd_df.rename_axis('position')
kd_df.head
print('Checking: position col i.e. index should be numeric')
if kd_df.index.dtype == 'int64':
print('PASS: position col is numeric'
, '\ndtype is:', kd_df.index.dtype)
else:
print('FAIL: position col is not numeric'
, '\nConverting to numeric')
kd_df.index = kd_df.index.astype('int64')
print('Checking dtype for after conversion:\n'
, '\ndtype is:', kd_df.index.dtype
, '\n=========================================================')
#===============
# writing file
#===============
print('Writing file:'
, '\nFilename:', outputkdcsv
# , '\nPath:', outdir
, '\nExpected no. of rows:', len(kd_df)
, '\nExpected no. of cols:', len(kd_df.columns)
, '\n=============================================================')
kd_df.to_csv(outputkdcsv, header = True, index = True)
#===============
# plot: optional!
#===============
# http://www.dalkescientific.com/writings/NBN/plotting.html
# FIXME: save fig
# extract just the pdb id from 'id' to pass to the title of the plot
# foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
# if doplot:
plot(kd_values, linewidth = 1.0)
#axis(xmin = 1, xmax = num_residues)
xlabel('Residue Number')
ylabel('Hydrophobicity')
title('K&D Hydrophobicity for ' + id)
show()
#%% end of function
#=======================================================================
#%% call function
#kd_to_csv(infile, outfile, windowsize = 3)
#=======================================================================
def main():
print('Running hydropathy calcs with the following params\n'
, in_filename
, '\noutfile:', out_filename)
kd_to_csv(infile, outfile, 3)
print('Finished writing file:'
, '\nFilename:', outfile
, '\n=============================================================')
if __name__ == '__main__':
main()
#%% end of script
#=======================================================================
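#%% standalone sketch (illustrative only, not part of the pipeline): the same
# Biopython call used in kd_to_csv() above, assuming a single-record fasta file
# named 'example.fasta' exists in the working directory:
# from Bio import SeqIO
# from Bio.SeqUtils.ProtParam import ProteinAnalysis
# from Bio.SeqUtils import ProtParamData
# rec = next(SeqIO.parse('example.fasta', 'fasta'))
# kd = ProteinAnalysis(str(rec.seq)).protein_scale(ProtParamData.kd, window = 3)
# print(len(rec.seq), len(kd)) # len(kd) == len(rec.seq) - window + 1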

View file

@ -1,152 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 10 11:13:49 2020
@author: tanu
"""
#==============================================================================
# TASK
# To format the snp_info.txt file that has already been processed in bash
# to include mcsm-style muts and gwas-style muts. The idea is that the info file
# will contain all possible mutation format styles to make it easy to populate
# and link other files with this sort of meta data. For example: the OR file
#=======================================================================
# FIXME : add bash info here as well
#%% useful links
#https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/
#https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/
#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
#=======================================================================
#%% specify dirs
import os, sys
import pandas as pd
import numpy as np
import re
import argparse
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#from reference_dict import my_aa_dict
#from reference_dict import low_3letter_dict # equivalent of my_aa_dict
from reference_dict import oneletter_aa_dict
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = 'pncA') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variables
#gene = 'pncA'
#drug = 'pyrazinamide'
#gene_match = gene +'_p.'
# cmd variables
gene = args.gene
drug = args.drug
gene_match = gene +'_p.'
#=======================================================================
#%% input and output dirs and files
#=======
# data dir
#=======
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'
#=======
# input
#=======
gene_info_filename = 'ns'+ gene.lower()+ '_snp_info1.txt'
gene_info = indir + '/' + gene_info_filename
print('gene info file: ', gene_info
, '\n============================================================')
#=======
# output
#=======
gene_snp_info_filename = 'ns' + gene.lower() + '_snp_info.csv' # other one is called AFandOR
outfile_snp_info = indir + '/' + gene_snp_info_filename
print('Output file: ', outfile_snp_info
, '\n============================================================')
#%% read files: preformatted using bash
info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10
#%% Split into three cols with 1-letter aa_code & combine to get mutationinformation column
# check mutation format in existing df
info_df2.head()
info_df2['mut_info'].head()
print('Creating column: mutationinformation')
info_df2_ncols = len(info_df2.columns)
info_df2['wild_type'] = info_df2['mut_info'].str.extract('(\w{1})>')
info_df2['position'] = info_df2['mut_info'].str.extract('(\d+)')
info_df2['mutant_type'] = info_df2['mut_info'].str.extract('>\d+(\w{1})')
info_df2['mutationinformation'] = info_df2['wild_type'] + info_df2['position'] + info_df2['mutant_type']
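# Illustrative example (hypothetical value): a mut_info string like 'K>96T'
# would give wild_type = 'K', position = '96', mutant_type = 'T', and hence
# mutationinformation = 'K96T' (mcsm style)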
# sanity check
ncols_add = 4 # Beware hardcoded
if len(info_df2.columns) == info_df2_ncols + ncols_add:
print('PASS: Successfully extracted and added mutationinformation (mcsm style) as below\n'
, info_df2['mutationinformation'].head()
, '\n=====================================================================')
else:
print('FAIL: No. of cols mismatch'
,'\noriginal length:', info_df2_ncols
, '\nExpected no. of cols:', info_df2_ncols + ncols_add
, '\nGot no. of cols:', len(info_df2.columns))
sys.exit()
# update ncols
info_df2_ncols = len(info_df2.columns)
#%% Creating column 'mutation' which has mutations of the format:
# <gene_match>.lower()<abc>1<cde>: pnca_p.trp68gly
# match the 'one_letter_code' value to get the dict key (three-letter code)
print('Creating column: mutation')
# dict to use: oneletter_aa_dict
lookup_dict = dict()
for k1, v1 in oneletter_aa_dict.items():
lookup_dict[k1] = v1['three_letter_code_lower']
# map each 1-letter aa code to its lowercase 3-letter equivalent (no loop needed)
info_df2['wt_3let'] = info_df2['wild_type'].map(lookup_dict)
info_df2['mt_3let'] = info_df2['mutant_type'].map(lookup_dict)
# create column mutation
info_df2['mutation'] = info_df2['wt_3let'] + info_df2['position'] + info_df2['mt_3let']
# add prefix: gene_match to each value in column
info_df2['mutation'] = gene_match.lower() + info_df2['mutation'].astype(str)
# sanity check
ncols_add = 3 # Beware hardcoded
if len(info_df2.columns) == info_df2_ncols + ncols_add:
print('PASS: Successfully created column mutation as below\n'
, info_df2['mutation'].head()
, '\n=====================================================================')
else:
print('FAIL: No. of cols mismatch\noriginal length:', info_df2_ncols
, '\nExpected no. of cols:', info_df2_ncols + ncols_add
, '\nGot no. of cols:', len(info_df2.columns))
sys.exit()
#%% write file
print('\n====================================================================='
, '\nWriting output file:\n', outfile_snp_info
, '\nNo.of rows:', len(info_df2)
, '\nNo. of cols:', len(info_df2.columns))
info_df2.to_csv(outfile_snp_info, index = False)

View file

@ -1,334 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 10 11:13:49 2020
@author: tanu
"""
#=======================================================================
#%% useful links
#https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/
#https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/
#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
#=======================================================================
#%% specify dirs
import os, sys
import pandas as pd
import numpy as np
import re
import argparse
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
# local import
from find_missense import find_missense
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = 'pncA') # case sensitive
arg_parser.add_argument('-s', '--start_coord', help = 'start of coding region (cds) of gene', default = 2288681) # pnca cds
arg_parser.add_argument('-e', '--end_coord', help = 'end of coding region (cds) of gene', default = 2289241) # pnca cds
args = arg_parser.parse_args()
#=======================================================================
#%% variables
#gene = 'pncA'
#drug = 'pyrazinamide'
#start_cds = 2288681
#end_cds = 2289241
# cmd variables
gene = args.gene
drug = args.drug
start_cds = args.start_coord
end_cds = args.end_coord
#=======================================================================
#%% input and output dirs and files
#=======
# data dir
#=======
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'
#=======
# input
#=======
info_filename = 'snp_info.txt'
snp_info = datadir + '/' + info_filename
print('Info file: ', snp_info
, '\n============================================================')
gene_info_filename = 'ns'+ gene.lower()+ '_snp_info.txt'
gene_info = indir + '/' + gene_info_filename
print('gene info file: ', gene_info
, '\n============================================================')
in_filename_or = 'ns'+ gene.lower()+ '_assoc.txt'
gene_or = indir + '/' + in_filename_or
print('gene OR file: ', gene_or
, '\n============================================================')
#=======
# output
#=======
gene_or_filename = gene.lower() + '_af_or_kinship.csv' # other one is called AFandOR
outfile_or_kin = outdir + '/' + gene_or_filename
print('Output file: ', outfile_or_kin
, '\n============================================================')
#%% read files: preformatted using bash
# or file: '...assoc.txt'
or_df = pd.read_csv(gene_or, sep = '\t', header = 0, index_col = False) # 182, 12 (without filtering for missense muts it was 212, i.e. 30 muts weren't missense)
or_df.head()
or_df.columns
#%% snp_info file: master and gene specific ones
# gene info
#info_df2 = pd.read_csv('nssnp_info_pnca.txt', sep = '\t', header = 0) #303, 10
info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10
mis_mut_cover = (info_df2['chromosome_number'].nunique()/info_df2['chromosome_number'].count()) * 100
print('*****RESULT*****'
, '\nPercentage of MISsense mut in pncA:', mis_mut_cover
, '\n*****RESULT*****') #65.7%
# large file
#info_df = pd.read_csv('snp_info.txt', sep = '\t', header = None) #12010
info_df = pd.read_csv(snp_info, sep = '\t') #12010
#info_df.columns = ['chromosome_number', 'ref_allele', 'alt_allele', 'snp_info'] #12009, 4
info_df['chromosome_number'].nunique() #10257
mut_cover = (info_df['chromosome_number'].nunique()/info_df['chromosome_number'].count()) * 100
print('*****RESULT*****'
,'\nPercentage of mutations in pncA:', mut_cover
, '\n*****RESULT*****') #85.4%
# extract unique chr position numbers
genomic_pos = info_df['chromosome_number'].unique()
genomic_pos_df = pd.DataFrame(genomic_pos, columns = ['chr_pos'])
genomic_pos_df.dtypes
genomic_pos_min = info_df['chromosome_number'].min()
genomic_pos_max = info_df['chromosome_number'].max()
# genomic coord for pnca coding region
cds_len = (end_cds-start_cds) + 1
pred_prot_len = (cds_len/3) - 1
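# Worked example with the default pncA coords: (2289241 - 2288681) + 1 = 561 bp,
# so pred_prot_len = 561/3 - 1 = 186 aa (the - 1 presumably drops the stop codon)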
# mindblowing: difference b/w bitwise (&) and 'and'
# DO NOT want &: is this bit set to '1' in both variables? Is this what you want?
#if (genomic_pos_min <= start_cds) & (genomic_pos_max >= end_cds):
print('*****RESULT*****'
, '\nlength of coding region:', cds_len, 'bp'
, '\npredicted protein length:', pred_prot_len, 'aa'
, '\n*****RESULT*****')
if genomic_pos_min <= start_cds and genomic_pos_max >= end_cds:
print ('PASS: coding region for gene included in snp_info.txt')
else:
print('FAIL: coding region for gene not included in info file snp_info.txt')
sys.exit('ERROR: coding region of gene not included in the info file')
#%% Extracting ref allele and alt allele as single letters
# info_df has some of these params as more than a single letter, which means that
# when you try to merge ONLY using chromosome_number, then it messes up... and is WRONG.
# Hence the merge needs to be performed on a unique set of attributes which in our case
# would be chromosome_number, ref_allele and alt_allele
#FIXME: Turn to a function
orig_len = len(or_df.columns)
#find_missense(or_df, 'ref_allele1', 'alt_allele0')
find_missense(or_df, ref_allele_column = 'ref_allele1', alt_allele_column = 'alt_allele0')
ncols_add = 4
if len(or_df.columns) == orig_len + ncols_add:
print('PASS: Successfully extracted ref and alt alleles for missense muts')
else:
print('FAIL: No. of cols mismatch'
,'\noriginal length:', orig_len
, '\nExpected no. of cols:', orig_len + ncols_add
, '\nGot no. of cols:', len(or_df.columns))
sys.exit()
del(orig_len, ncols_add)
#%% TRY MERGE
# check dtypes
or_df.dtypes
info_df.dtypes
#or_df.info()
# pandas documentation where it mentions: "Pandas uses the object dtype for storing strings"
# check how many unique chr_num in info_df are in or_df
genomic_pos_df['chr_pos'].isin(or_df['chromosome_number']).sum() #144
# check how many chr_num in or_df are in info_df: should be ALL of them
or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() #182
# sanity check 2
if or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() == len(or_df):
print('PASS: all genomic locs in or_df have meta data in snp_info.txt')
else:
sys.exit('FAIL: some genomic locs or_df chr number DO NOT have meta data in snp_info.txt')
#%% Perform merge
#my_join = 'inner'
#my_join = 'outer'
my_join = 'left'
#my_join = 'right'
#dfm1 = pd.merge(or_df, info_df, on ='chromosome_number', how = my_join, indicator = True) # not unique!
dfm1 = pd.merge(or_df, info_df, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True)
dfm1['_merge'].value_counts()
# count no. of missense mutations ONLY
dfm1.snp_info.str.count(r'(missense.*)').sum()
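# (each row whose snp_info mentions 'missense' contributes exactly 1 to this
# sum, since the trailing .* swallows the rest of the string)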
dfm2 = pd.merge(or_df, info_df2, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True)
dfm2['_merge'].value_counts()
# count no. of nan
dfm2['mut_type'].isna().sum()
# drop nan
dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
#%% sanity check
# count no. of missense muts
if len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum() == dfm2['mut_type'].isna().sum():
print('PASSED: numbers cross checked'
, '\nTotal no. of missense mutations:', dfm1.snp_info.str.count(r'(missense.*)').sum()
, '\nNo. of mutations falsely assumed to be missense:', len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum())
# two ways to filter to get only missense muts
test = dfm1[dfm1['snp_info'].str.count('missense.*')>0]
dfm1_mis = dfm1[dfm1['snp_info'].str.match('(missense.*)') == True]
test.equals(dfm1_mis)
# drop nan
dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
if dfm1_mis[['chromosome_number', 'ref_allele', 'alt_allele']].equals(dfm2_mis[['chromosome_number', 'ref_allele', 'alt_allele']]):
print('PASS: Further cross checks successful')
else:
print('FAIL: Second cross check unsuccessful. Debug please!')
sys.exit()
#%% extract mut info into three cols
orig_len = len(dfm2_mis.columns)
dfm2_mis['wild_type'] = dfm2_mis['mut_info'].str.extract('(\w{1})>')
dfm2_mis['position'] = dfm2_mis['mut_info'].str.extract('(\d+)')
dfm2_mis['mutant_type'] = dfm2_mis['mut_info'].str.extract('>\d+(\w{1})')
dfm2_mis['mutationinformation'] = dfm2_mis['wild_type'] + dfm2_mis['position'] + dfm2_mis['mutant_type']
# sanity check
ncols_add = 4
if len(dfm2_mis.columns) == orig_len + ncols_add:
print('PASS: Successfully extracted and added mutationinformation (mcsm style)')
else:
print('FAIL: No. of cols mismatch'
,'\noriginal length:', orig_len
, '\nExpected no. of cols:', orig_len + ncols_add
, '\nGot no. of cols:', len(dfm2_mis.columns))
sys.exit()
#%% formatting data for output
print('no of cols preformatting data:', len(dfm2_mis.columns))
#1) Add column: OR for kinship calculated from beta coeff
print('converting beta coeff to OR by exponentiating:\n'
, dfm2_mis['beta'].head())
dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta'])
print(dfm2_mis['or_kin'].head())
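# Quick sanity check on the maths (illustrative): beta = 0 gives or_kin = exp(0) = 1
# (no effect), and beta ~ 0.693 gives or_kin ~ 2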
#2) rename af column
dfm2_mis.rename(columns = {'af': 'af_kin'
, 'beta': 'beta_kin'
, 'p_wald': 'pwald_kin'
, 'se': 'se_kin', 'logl_H1': 'logl_H1_kin'
, 'l_remle': 'l_remle_kin'}, inplace = True)
#3) drop some not required cols (including duplicate if you want)
#3a) drop duplicate columns
dfm2_mis2 = dfm2_mis.T.drop_duplicates().T #changes dtypes in cols, so not used
dup_cols = set(dfm2_mis.columns).difference(dfm2_mis2.columns)
print('Duplicate columns identified:', dup_cols)
dup_cols = {'alt_allele0', 'ps'} # didn't want to remove tot_diff
print('removing duplicate columns: kept one of the dup_cols i.e tot_diff')
dfm2_mis.drop(list(dup_cols), axis = 1, inplace = True)
print(dfm2_mis.columns)
#3b) other not useful columns
dfm2_mis.drop(['chromosome_text', 'chr', 'symbol', '_merge', ], axis = 1, inplace = True)
dfm2_mis.rename(columns = {'ref_allele1': 'reference_allele'}, inplace = True)
print(dfm2_mis.columns)
#4) reorder columns
orkin_linked = dfm2_mis[['mutationinformation',
'wild_type',
'position',
'mutant_type',
'chr_num_allele',
'ref_allele',
'alt_allele',
'mut_info',
'mut_type',
'gene_id',
'gene_number',
'mut_region',
'reference_allele',
'alternate_allele',
'chromosome_number',
#'afs
'af_kin',
'or_kin',
# 'ors_logistic',
# 'ors_chi_cus',
# 'ors_fisher',
'pwald_kin',
# 'pvals_logistic',
# 'pvals_fisher',
# 'ci_lb_fisher',
# 'ci_ub_fisher' ,
'beta_kin',
'se_kin',
'logl_H1_kin',
'l_remle_kin',
# 'stat_chi',
# 'pvals_chi',
'n_diff',
'tot_diff',
'n_miss']]
# sanity check after reassigning columns
if orkin_linked.shape == dfm2_mis.shape and set(orkin_linked.columns) == set(dfm2_mis.columns):
print('PASS: Successfully formatted df with rearranged columns')
else:
sys.exit('FAIL: something went wrong when rearranging columns!')
#%% write file
print('\n====================================================================='
, '\nWriting output file:\n', outfile_or_kin
, '\nNo.of rows:', len(dfm2_mis)
, '\nNo. of cols:', len(dfm2_mis.columns))
orkin_linked.to_csv(outfile_or_kin, index = False)
#%% diff b/w allele0 and 1: or_df
#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
#df = or_df.iloc[[5, 15, 17, 19, 34]]
#df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int)
#df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int)

View file

@ -1,34 +0,0 @@
#!/usr/bin/env python3
from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from align import myalign
import re
import os
os.chdir('/home/tanu/git/LSHTM_analysis/scripts/examples')
def main():
"""
align ref_seq and pdb_seq
# FIXME: pass command line args i.e filename
"""
my_dict = {}
align_fastas_to_align = open('align_fastas.txt', 'r')
for record in SeqIO.parse(align_fastas_to_align,"fasta"):
myid = record.id
seq = str(record.seq)
my_dict.update({myid : seq})
my_keys = list(my_dict.keys())
my_ref_seq = my_dict[my_keys[0]]
my_pdb_seq = my_dict[my_keys[1]]
fasta_alignment = myalign(my_ref_seq, my_pdb_seq)
print(fasta_alignment)
print('class:', type(fasta_alignment))
if __name__ == '__main__':
main()

View file

@ -1,70 +0,0 @@
#!/usr/bin/env python3
# Copyright 2020, Tanushree Tunstall
# This program is distributed under General Public License v. 3. See the file
# COPYING for a copy of the license.
__description__ = \
"""
chain_extract.py
extract chain/s from pdb and saves each chain as a separate file
"""
__author__ = "Tanushree Tunstall"
__date__ = ""
#=======================================================================
import os, shutil, sys
#from pdbtools.helper import cmdline
from chain_extract import ChainExtract
from Bio.PDB import PDBParser, PDBIO, Select
#from Bio.PDB.PDBParser import PDBParser
#io = PDBIO()
import argparse
#=======================================================================
def main():
"""
Function to call if run from command line.
Example use:
pdb_chain_extract.py -i <your_pdb_file> -c <chainid1><chainid2> -p <outpath> -o <outfile>
Extracts chain 'A' by default.
"""
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-i', '--pdb_file', help='provide pdb file', default = 'None')
arg_parser.add_argument('-c', '--chain', help='chain/s to extract without spaces.', nargs = '+', default = 'A', type = list)
arg_parser.add_argument('-p', '--out_path', help='specify output path', default = '.', type = str)
arg_parser.add_argument('-o', '--out_file', help='specify output filename. Will be used as a prefix to append chain id and pdb file extension', default = 'pdbfile', type = str)
args = arg_parser.parse_args()
# Extract chains and write each chain as a separate file
pdb_file = args.pdb_file
print('input pdb file:', pdb_file)
# type = list, makes it a list of lists. Hence extracting the list of chains.
chains = args.chain[0]
#chains = ['A','B','C']
print ('user supplied chain:', chains)
# output filename and path
outpath = args.out_path
outfile = args.out_file
# get structure
p = PDBParser(PERMISSIVE=1)
structure = p.get_structure(pdb_file, pdb_file)
print('input pdb filename:', structure.get_id())
my_chains = chains
#my_chains = ['G', 'H']
c_names = ''.join(my_chains)
print('Extracting chains:', my_chains)
pdb_chains_file = outpath + '/' + outfile + '_' + c_names + '.pdb'
io = PDBIO()
io.set_structure(structure)
io.save(pdb_chains_file, ChainExtract(my_chains))
if __name__ == "__main__":
main()

View file

@ -1,71 +0,0 @@
#!/usr/bin/env python3
# Copyright 2020, Tanushree Tunstall
# This program is distributed under General Public License v. 3. See the file
# COPYING for a copy of the license.
__description__ = \
"""
chain_splitter.py
extract chain/s from pdb and saves each chain as a separate file
"""
__author__ = "Tanushree Tunstall"
__date__ = ""
#=======================================================================
import os, shutil, sys
#from pdbtools.helper import cmdline
from chain_splitter import ChainSelect
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser
#io = PDBIO()
import argparse
#=======================================================================
def main():
"""
Function to call if run from command line.
Example use:
pdb_chain_splitter.py -i <your_pdb_file> -c <chainid1><chainid2>
Extracts chain 'A' by default.
FIXME: extract all chains from the given pdb and write them out individually
"""
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-i', '--pdb_file', help='provide pdb file', default = 'None')
arg_parser.add_argument('-c', '--chain', help='chain/s to extract without spaces.', nargs = '+', default = 'A', type = list)
arg_parser.add_argument('-p', '--out_path', help='specify output path', default = '.', type = str)
arg_parser.add_argument('-o', '--out_file', help='specify output filename. Will be used as a prefix to append chain id and pdb file extension', default = 'pdb_file_chain', type = str)
args = arg_parser.parse_args()
# Extract chains and write each chain as a separate file
pdb_file = args.pdb_file
print('input pdb file:', pdb_file)
# type = list, makes it a list of lists. Hence extracting the list of chains.
chains = args.chain[0]
#chains = ['A','B','C']
print ('user supplied chain:', chains)
# output filename and path
outpath = args.out_path
outfile = args.out_file
# get structure
p = PDBParser(PERMISSIVE=1)
structure = p.get_structure(pdb_file, pdb_file)
print('input pdb filename:', structure.get_id())
for chain in chains:
chain = chain.upper()
print ('Extracting chain:', chain)
pdb_chain_file = outpath + '/' + outfile + '_{}.pdb'.format(chain)
io = PDBIO()
io.set_structure(structure)
io.save('{}'.format(pdb_chain_file), ChainSelect(chain))
if __name__ == "__main__":
main()

@ -1 +0,0 @@
Subproject commit 881ff8f27aaf1db4266a84fb03baad3dab552c64

View file

@ -1,70 +0,0 @@
#======================================================
# renumber pdb file based on user defined start number
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_residue_renumber /home/tanu/git/Data/cycloserine/input/alr_complex_model.pdb -s 35 -r
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_residue_renumber /home/tanu/git/Data/rifampicin/input/rpob_complex_model.pdb -s 29 -r
#======================================================
# pdb_seq.py: extract seq from structure
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/Data/ethambutol/input/3byw.pdb > 3byw_seq.txt
#/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -c A -a /home/tanu/git/Data/ethambutol/input/3byw.pdb > 3byw_seq.txt
#======================================================
# gidB
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/LSHTM_3TB/gid/docking/3g89.pdb > 3g89_seq.txt
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/LSHTM_3TB/gid/docking/gidb_chopin1.pdb > gidb_chopin1_seq.txt
alignment
>3g89A_ATOM chain_length:238
MFGKHPGGLSERGRALLLEGGKALGLDLKPHLEAFSRLYALLQEAGEEEVVVKHFLDSLTLLRLPLWQGPLRVLDLGTGA
GFPGLPLKIVRPELELVLVDATRKKVAFVERAIEVLGLKGARALWGRAEVLAREAGHREAYARAVARAVAPLCVLSELLL
PFLEVGGAAVAMKGPRVEEELAPLPPALERLGGRLGEVLALQLPLSGEARHLVVLEKTAPTPPAYPRRPGVPERHPLC
>gidb_chopin1 _ATOM chain_length:224
MSPIEPAASAIFGPRLGLARRYAEALAGPGVERGLVGPREVGRLWDRHLLNCAVIGELLERGDRVVDIGSGAGLPGVPLA
IARPDLQVVLLEPLLRRTESLREMVTDLGVAVEIVRGRAEESWVQDQLGGSDAAVSRAVAALDKLTKWSMPLIRPNGRML
AIKGERAHDEVREHRRVMIASGAVDVRVVTCGANYLRPPATVVFARRGKQIARGSARMASGGTA
#======================================================
# pdb_mutator.py: mutate residue: FIXME, needs charm
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_mutator -r 39 -m XXX /home/tanu/git/Data/ethambutol/input/3byw.pdb
#======================================================
# pdb_ligand.py: list ligands within pdb
# note: works ONLY for pdb containing ligands
# this is because such pdbs contain a field 'HETATM '
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand /home/tanu/git/Data/ethambutol/input/7bvf_b.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand /home/tanu/git/Data/ethambutol/input/7bvf.pdb
#======================================================
# pdb_hetatm.py: list ligands for valid pdbs AND docked complexes (my use case)
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/cycloserine/input/alr_complex.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/pyrazinamide/input/pnca_complex.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/ethambutol/input/7bvf_b.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/ethambutol/input/7bvf.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/rifampicin/input/rpob_complex.pdb
#======================================================
# get torsion angles
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_torsion /home/tanu/git/Data/rifampicin/input/rpob_complex.pdb > /home/tanu/git/Data/rifampicin/input/rpob_torsion.txt
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# my pdb tools
#======================================================
# save specifed chains as individual pdbs
#======================================================
./pdb_chain_splitter.py -i /home/tanu/git/Data/ethambutol/input/3byw.pdb -c DF -p /home/tanu/git/Data/ethambutol/input -o 3byw
#======================================================
# save specifed chains as one pdb
#======================================================
./pdb_chain_extract.py -i /home/tanu/git/Data/ethambutol/input/3byw.pdb -c DF -p /home/tanu/git/Data/ethambutol/input -o 3byw

View file

@ -1,12 +0,0 @@
[extractor]
data_dir = /home/tanu/git/Data
#master_file = original_tanushree_data_v2.csv
master_file = mtb_gwas_v3.csv
# Relative Paths. Per-drug paths will be created like:
#
# /home/tanu/git/Data/<drug name>/input
# /home/tanu/git/Data/<drug name>/output
input_dir = input
output_dir = output

View file

@ -1,172 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
#=============================================================================
# Task: Residue depth (rd) processing to generate a df with residue_depth(rd)
# values
# FIXME
# Input: '.tsv' i.e residue depth txt file (output from .zip file manually
# downloaded from the website).
# This should be integrated into the pipeline
# Output: .csv with 3 cols i.e position, rd_values & 3-letter wt aa code(caps)
#=============================================================================
#%% load packages
import sys, os
import argparse
import pandas as pd
#=============================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
arg_parser.add_argument('-g', '--gene', help='gene name', default = None) # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = args.drug
gene = args.gene
gene_match = gene + '_p.'
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
outdir = datadir + '/' + drug + '/' + 'output'
in_filename = '3pl1_rd.tsv'
infile = outdir + '/' + in_filename
print('Input filename:', in_filename
, '\nInput path:', outdir
, '\n=============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_rd.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% rd values from <gene>_rd.tsv values
def rd_to_csv(inputtsv, outputrdcsv):
"""
Extract residue depth (rd) values from the downloaded tsv file
@param inputtsv: tsv file downloaded from {INSERT LINK}
@type inputtsv: string
@param outputrdcsv: csv file with rd values
@type outfile: string
@return: none, writes rd values df as csv
"""
#========================
# read downloaded tsv file
#========================
#%% Read input file
rd_data = pd.read_csv(inputtsv, sep = '\t')
print('Reading input file:', inputtsv
, '\nNo. of rows:', len(rd_data)
, '\nNo. of cols:', len(rd_data.columns))
print('Column names:', rd_data.columns
, '\n===============================================================')
#========================
# creating position col
#========================
# Extracting residue number from index and assigning
# the values to a column [position]. Then convert the position col to numeric.
rd_data['position'] = rd_data.index.str.extract('([0-9]+)').values
# converting position to numeric
rd_data['position'] = pd.to_numeric(rd_data['position'])
rd_data['position'].dtype
print('Extracted residue num from index and assigned as a column:'
, '\ncolumn name: position'
, '\ntotal no. of cols now:', len(rd_data.columns)
, '\n=============================================================')
#========================
# Renaming amino-acid
# and all-atom cols
#========================
print('Renaming columns:'
, '\ncolname==> # chain:residue: wt_3letter_caps'
, '\nYES... the column name *actually* contains a # ..!'
, '\ncolname==> all-atom: rd_values'
, '\n=============================================================')
rd_data.rename(columns = {'# chain:residue':'wt_3letter_caps', 'all-atom':'rd_values'}, inplace = True)
print('Column names:', rd_data.columns)
#========================
# extracting df with the
# desired columns
#========================
print('Extracting relevant columns for writing df as csv')
rd_df = rd_data[['position','rd_values','wt_3letter_caps']]
if len(rd_df) == len(rd_data):
print('PASS: extracted df has expected no. of rows'
,'\nExtracted df dim:'
,'\nNo. of rows:', len(rd_df)
,'\nNo. of cols:', len(rd_df.columns))
else:
print('FAIL: no. of rows mismatch'
, '\nExpected no. of rows:', len(rd_data)
, '\nGot no. of rows:', len(rd_df)
, '\n=====================================================')
#===============
# writing file
#===============
print('Writing file:'
, '\nFilename:', outputrdcsv
# , '\nPath:', outdir
# , '\nExpected no. of rows:', len(rd_df)
# , '\nExpected no. of cols:', len(rd_df.columns)
, '\n=========================================================')
rd_df.to_csv(outputrdcsv, header = True, index = False)
#%% end of function
#=======================================================================
#%% call function
#rd_to_csv(infile, outfile)
#=======================================================================
def main():
print('Running residue depth processing with the following params\n'
, in_filename
, '\noutfile:', out_filename)
rd_to_csv(infile, outfile)
print('Finished Writing file:'
, '\nFilename:', outfile
, '\n=============================================================')
if __name__ == '__main__':
main()
#%% end of script
#=======================================================================

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,15 +0,0 @@
>Mycobacterium tuberculosis H37Rv|Rv3423c|alr
VKRFWENVGKPNDTTDGRGTTSLAMTPISQTPGLLAEAMVDLGAIEHNVRVLREHAGHAQLMAVVKADGYGH
GATRVAQTALGAGAAELGVATVDEALALRADGITAPVLAWLHPPGIDFGPALLADVQVAVSSLRQLDELLHA
VRRTGRTATVTVKVDTGLNRNGVGPAQFPAMLTALRQAMAEDAVRLRGLMSHMVYADKPDDSINDVQAQRFT
AFLAQAREQGVRFEVAHLSNSSATMARPDLTFDLVRPGIAVYGLSPVPALGDMGLVPAMTVKCAVALVKSIR
AGEGVSYGHTWIAPRDTNLALLPIGYADGVFRSLGGRLEVLINGRRCPGVGRICMDQFMVDLGPGPLDVAEG
DEAILFGPGIRGEPTAQDWADLVGTIHYEVVTSPRGRITRTYREAENR
>alr_complex | chain A | 371 aa
LAEAMVDLGAIEHNVRVLREHAGHAQLMAVVKADGYGHGATRVAQTALGAGAAELGVATVDEALALRADGIT
APVLAWLHPPGIDFGPALLADVQVAVSSLRQLDELLHAVRRTGRTATVTVKVDTGLNRNGVGPAQFPAMLTA
LRQAMAEDAVRLRGLMSHMVYADKPDDSINDVQAQRFTAFLAQAREQGVRFEVAHLSNSSATMARPDLTFDL
VRPGIAVYGLSPVPALGDMGLVPAMTVKCAVALVKSIRAGEGVSYGHTWIAPRDTNLALLPIGYADGVFRSL
GGRLEVLINGRRCPGVGRICMDQFMVDLGPGPLDVAEGDEAILFGPGIRGEPTAQDWADLVGTIHYEVVTSP
RGRITRTYREA

View file

@ -1,125 +0,0 @@
#!/usr/bin/env python3
# useful links
#https://towardsdatascience.com/pairwise-sequence-alignment-using-biopython-d1a9d0ba861f
#https://www.biostars.org/p/265338/
from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
import re
import os
#%%
os.chdir('/home/tanu/git/LSHTM_analysis/scripts/examples')
#%%
def myalign(ref_seq, pdb_seq):
myalign_dict = {}
alignments = pairwise2.align.globalxx(ref_seq, pdb_seq)
#alignments = pairwise2.align.localxx(ref, struct)
match = []
for a, b in zip(alignments[0][0], alignments[0][1]):
if a == b:
match.append('|')
else:
match.append(' ')
#print(match)
print(alignments[0][0])
print("".join(match))
print(alignments[0][1])
result_align = alignments[0][1]
#print(result_align)
print('===============================================================\n')
# update dict
myalign_dict.update({'aligned_fasta': result_align})
aa_regex = '\w'
m = re.search(aa_regex, result_align)
#m = my_match.span()
offset = m.start()
offset_end = m.end()
print('start of match:', offset
, '\nend of match:', offset_end)
print('===============================================================\n')
# update dict
myalign_dict.update({'start_match' : offset})
myalign_dict.update({'end_match' : offset_end})
return myalign_dict
def main():
"""
read file containing reference and pdb_sequence to align
"""
my_dict = {}
align_fastas_to_align = open('align_fastas.txt', 'r')
for record in SeqIO.parse(align_fastas_to_align,"fasta"):
myid = record.id
seq = str(record.seq)
my_dict.update({myid : seq})
my_keys = list(my_dict.keys())
my_ref_seq = my_dict[my_keys[0]] # ref_seq
my_pdb_seq = my_dict[my_keys[1]] # pdb_seq
fasta_alignment = myalign(my_ref_seq, my_pdb_seq)
print('this is my result:', fasta_alignment)
if __name__ == '__main__':
main()
#%% debug: individually
my_dict = {}
align_fastas_to_align = open('align_fastas.txt', 'r')
for record in SeqIO.parse(align_fastas_to_align,"fasta"):
myid = record.id
seq = str(record.seq)
#print(myid, seq)
my_dict.update({myid: seq})
print(my_dict)
print(my_dict.keys())
my_keys = list(my_dict.keys())
alignments = pairwise2.align.globalxx(my_dict[my_keys[0]], my_dict[my_keys[1]])
match = []
for a, b in zip(alignments[0][0], alignments[0][1]):
if a == b:
match.append('|')
else:
match.append(' ')
#print(match)
print(alignments[0][0])
print("".join(match))
print(alignments[0][1])
result_align = alignments[0][1]
#print(result_align)
print('===============================================================\n')
#offset = ''
aa_regex = '\w'
m = re.search(aa_regex, result_align)
#m = my_match.span()
offset = m.start()
offset_end = m.end()
print('start of match:', offset, '\nend of match:', offset_end)
print('===============================================================\n')

File diff suppressed because it is too large Load diff

View file

@ -1,52 +0,0 @@
#!/usr/bin/python3
#=======================================================================
# TASK: select specified chains from the structure & save a cropped structure with
# the selected chains. Useful for dimer, etc modelling.
# link for saving each chain as a separate file
# https://stackoverflow.com/questions/11685716/how-to-extract-chains-from-a-structure-file
#=======================================================================
#%%
import os, sys
from Bio.PDB import PDBParser, PDBIO, Select
#%% homdir and curr dir and local imports
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/Data/ethambutol/input/')
os.getcwd()
#%%
# Select() Method to return True for every chain in 'chains'
class ChainExtract(Select):
def __init__(self, chain):
self.chain = chain
def accept_chain(self, chain):
#print dir(chain)
if chain.id in self.chain:
return 1
else:
return 0
def main():
p = PDBParser(PERMISSIVE=1)
structure = p.get_structure("3byw", "3byw.pdb")
my_chains = ['G', 'H'] # specify selected chains
c_names = ''.join(my_chains)
pdb_chains_file = 'pdb_crop_' + c_names + '.pdb'
io = PDBIO()
io.set_structure(structure)
#io.save(structure.get_id() + "_crop.structure", ChainExtract(my_chains))
io.save(pdb_chains_file, ChainExtract(my_chains))
if __name__ == '__main__':
main()
#%%
# test
#my_chains = ['G', 'H'] # specify selected chains
#foo = ''.join(my_chains) # to append to file name
#pdb_chains_file = '_{}.pdb'.format(my_chains)

View file

@ -1,49 +0,0 @@
#!/usr/bin/python3
#=======================================================================
# TASK: extract chain from pdb and save each chain as a separate file
# link for saving each chain as a separate file
# https://stackoverflow.com/questions/11685716/how-to-extract-chains-from-a-pdb-file
# command line args
# https://stackoverflow.com/questions/15753701/how-can-i-pass-a-list-as-a-command-line-argument-with-argparse
#=======================================================================
#%%
import os, sys
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser
#%% homdir and curr dir and local imports
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
#%%
class ChainSelect(Select):
def __init__(self, chain):
self.chain = chain
def accept_chain(self, chain):
if chain.get_id() == self.chain:
return 1
else:
return 0
def main():
chains = ['A','B','C','F']
p = PDBParser(PERMISSIVE=1)
#structure = p.get_structure(pdb_file, pdb_file)
structure = p.get_structure('/home/tanu/git/Data/ethambutol/input/3byw', '/home/tanu/git/Data/ethambutol/input/3byw.pdb')
#print('STRUCTURE:', structure.get_id())
# pdb_filename = print()
for chain in chains:
pdb_chain_file = 'pdb_file_chain_{}.pdb'.format(chain)
io = PDBIO()
io.set_structure(structure)
io.save('{}'.format(pdb_chain_file), ChainSelect(chain))
# If run from command line...
if __name__ == "__main__":
main()

View file

@ -1,45 +0,0 @@
#!/usr/bin/env python
import os
from biopandas.pdb import PandasPdb
#%%
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts/examples')
#%%
file_list = ['7bvf_b.pdb', 'pnca_complex.pdb', 'alr_complex.pdb']
file_list = ['7bvf_b.pdb']
#file_list = ['pnca_complex.pdb']
file_list = ['alr_complex.pdb']
BORING_LIGANDS = ["HOH","CA","SO4","IOD","NA","CL","GOL","PO4"]
#%% df with list
ligands_dict = {}
for pdb_id in file_list:
ppdb = PandasPdb()
pdb_file = ppdb.read_pdb(pdb_id)
het = pdb_file.df['HETATM']
het_list = list(set(het['residue_name']))
ligands = [ l for l in het_list if l not in BORING_LIGANDS]
lig_dict = {pdb_id:ligands}
#lig_dict = {pdb_id:het_list} # include BORING_LIGANDS
ligands_dict.update(lig_dict)
print(ligands_dict)
print('pdb_code:', pdb_file.code) # works only in case of valid pdb
print('pdb_code:', pdb_file.pdb_path) #works for bespoke pdb but prints the full path
print('pdb_code:', os.path.basename(pdb_file.pdb_path)) # prints only the last part i.e filename
#%% test with one
ppdb = PandasPdb()
pdb_file = ppdb.read_pdb('7bvf_b.pdb')
het = pdb_file.df['HETATM']
het_list = list(set(het['residue_name']))
print(het_list)
ligands = [ l for l in het_list if l not in BORING_LIGANDS]
print(ligands)
#%% extract last part from file path
print(os.path.basename(homedir + '/git/LSHTM_analysis/scripts/examples'))
print(os.path.basename('alr_complex.pdb'))
foo = os.path.basename(pdb_file.pdb_path)
print(foo)

View file

@ -1,81 +0,0 @@
#!/usr/bin/env python
import os
from Bio.PDB import *
from biopandas.pdb import PandasPdb
from collections import defaultdict, OrderedDict
import pandas as pd
from functools import reduce
#%% see verison of pandas
#print(pd.__version__)
#%%
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts/examples')
# link
#https://www.pythonprogramming.in/pandas-count-distinct-values-of-one-column-depend-on-another-column.html
#https://datascience.stackexchange.com/questions/32328/export-pandas-to-dictionary-by-combining-multiple-row-values
# 3 way merge
#https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#https://stackoverflow.com/questions/52223045/merge-multiple-dataframes-based-on-a-common-column
#%% Read data
file_list = ['7bvf_b.pdb']
file_list = ['3byw.pdb']
#file_list = ['7bvf_b.pdb', 'pnca_complex.pdb', '3byw']
#%%
for pdb in file_list:
print(pdb)
p = PDBParser()
structure = p.get_structure(pdb, pdb)
for model in structure:
for chain in model:
for residue in chain:
for atom in residue:
pass # print(atom) # a loop body is required; uncomment the print to inspect atoms
#%% biopandas
pdb_dict = {}
for pdb_id in file_list:
ppdb = PandasPdb()
pdb_file = ppdb.read_pdb(pdb_id)
#dir(pdb_file)
atm_df = pdb_file.df['ATOM']
#print('column names:', atm_df.columns)
pdb_chains = list(set(atm_df['chain_id']))
print('pdb chains:', pdb_chains)
total_chains = len(pdb_chains)
print('total no. of chains:', total_chains)
chain_info = {}
#atm_df_s = atm_df.sort_values(by=['atom_number', 'chain_id', 'residue_number'])
c_start = atm_df.groupby('chain_id').residue_number.min()
print(c_start)
c_start_df = pd.DataFrame({'chain_id': c_start.index, 'start_res': c_start.values})
c_end = atm_df.groupby('chain_id').residue_number.max()
print(c_end)
c_end_df = pd.DataFrame({'chain_id': c_end.index, 'end_res': c_end.values})
c_length = atm_df.groupby('chain_id').residue_number.nunique()
print(c_length)
c_length_df = pd.DataFrame({'chain_id': c_length.index, 'chain_len': c_length.values})
# combine 3 series into and assign 'chain_id' as a column
# using reduce with a lambda function works well (it works with however many dataframes you want to merge)
# using pd.concat creates extra chain id cols
c_df = reduce(lambda left,right: pd.merge(left,right, on = 'chain_id'), [c_start_df, c_end_df, c_length_df])
# convert df to dict with 'chain_id' as key and columns as list of values
chain_dict = c_df.set_index('chain_id').T.to_dict('list')
print(chain_dict)
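# Illustrative output shape (chain ids/numbers are hypothetical):
# {'A': [1, 246, 246], 'B': [1, 246, 246]}
# i.e. chain_id: [start_res, end_res, chain_len]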
#%% Idea
#protein_name: total_chains: 8, total ligands/hetatm = 3
#df of chain details
#chain start_res end_res len_chain
#pdb tools script separate one chain
# remove water and

File diff suppressed because it is too large Load diff

60
scripts/tidy_split.py Normal file
View file

@ -0,0 +1,60 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
#=======================================================================
#%% load libraries
import os, sys
import pandas as pd
#import numpy as np
#=======================================================================
#%% homdir and curr dir and local imports
#homedir = os.path.expanduser('~')
# set working dir
#os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#os.getcwd()
#%%=====================================================================
# define the split function
def tidy_split(df, column, sep = '|', keep = False):
'''
Split the values of a column and expand so the new DataFrame has one split
value per row. Rows where the column is missing are kept as-is (the dropna call below is commented out).
Params
------
df : pandas.DataFrame
dataframe with the column to split and expand
column : str
the column to split and expand
sep : str
the string used to split the column's values
keep : bool
whether to retain the presplit value as its own row
Returns
-------
pandas.DataFrame
Returns a dataframe with the same columns as `df`.
'''
indexes = list()
new_values = list()
#df = df.dropna(subset=[column])#!!!!-----see this incase you need to uncomment based on use case
for i, presplit in enumerate(df[column].astype(str)):
values = presplit.split(sep)
if keep and len(values) > 1:
indexes.append(i)
new_values.append(presplit)
for value in values:
indexes.append(i)
new_values.append(value)
new_df = df.iloc[indexes, :].copy()
new_df[column] = new_values
return new_df
#%%=====================================================================
#end of tidy_split()
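#%%=====================================================================
# minimal usage sketch (illustrative only: the column name and values below
# are made up and not taken from any project file)
if __name__ == '__main__':
    example_df = pd.DataFrame({'id': [1, 2]
                               , 'mutations': ['S65W;L4S', 'T135P']})
    print(tidy_split(example_df, 'mutations', sep = ';'))
    # each ';'-separated mutation now occupies its own row, with 'id' repeated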