#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Wed Jun 10 11:13:49 2020 @author: tanu """ #======================================================================= #%% useful links #https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/ #https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/ #https://stackoverflow.com/questions/40348541/pandas-diff-with-string #======================================================================= import os, sys import pandas as pd import numpy as np import re import argparse homedir = os.path.expanduser('~') os.chdir(homedir + '/git/LSHTM_analysis/scripts') #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None) arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None) arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') # FIXME: remove defaults #arg_parser.add_argument('-sc', '--start_coord', help = 'start of coding region (cds) of gene', default = None, type = int) # pnca cds #arg_parser.add_argument('-ec', '--end_coord', help = 'end of coding region (cds) of gene', default = None, type = int) # pnca cds arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') args = arg_parser.parse_args() #======================================================================= #%% variables #gene = 'pncA' #drug = 'pyrazinamide' #start_cds = 2288681 #end_cds = 2289241 #%%===================================================================== # Command line options gene = args.gene drug = args.drug gene_match = gene + '_p.' datadir = args.datadir indir = args.input_dir outdir = args.output_dir #start_cds = args.start_coord #end_cds = args.end_coord #%%======================================================================= #============== # directories #============== if not datadir: datadir = homedir + '/' + 'git/Data' if not indir: indir = datadir + '/' + drug + '/input' if not outdir: outdir = datadir + '/' + drug + '/output' #======= # input #======= gene_info_filename = 'ns'+ gene.lower()+ '_snp_info.txt' #gene_info_filename = 'ns'+ gene.lower()+ '_snp_info.csv' gene_info = indir + '/' + gene_info_filename print('gene info file: ', gene_info , '\n============================================================') #======= # output #======= snpinfo_formatted_filename = 'ns' + gene.lower() + '_snp_info_f.csv' snpinfo_formatted = outdir + '/' + snpinfo_formatted_filename print('Output file: ', snpinfo_formatted , '\n============================================================') #%% read files: preformatted using bash info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #447, 10 #%% extract mut info into three cols df_ncols = len(info_df2.columns) print('Dim of df to add cols to:', df_ncols) # column names already present, wrap this in a if and perform sanity check ncols_add = 0 if not 'wild_type' in info_df2.columns: print('Extracting and adding column: wild_type' , '\n===============================================================') info_df2['wild_type'] = info_df2['mut_info_f1'].str.extract('(\w{1})>') ncols_add+=1 if not 'position' in info_df2.columns: print('Extracting and adding column: position' , '\n===============================================================') info_df2['position'] = info_df2['mut_info_f1'].str.extract('(\d+)') #info_df2['position'] = info_df2[:,'mut_info_f1'].str.extract('(\d+)') ncols_add+=1 if not 'mutant_type' in info_df2.columns: print('Extracting and adding column: mutant_type' , '\n================================================================') info_df2['mutant_type'] = info_df2['mut_info_f1'].str.extract('>\d+(\w{1})') ncols_add+=1 if not 'mutationinformation' in info_df2.columns: print('combining to create column: mutationinformation' , '\n===============================================================') info_df2['mutationinformation'] = info_df2['wild_type'] + info_df2['position'] + info_df2['mutant_type'] ncols_add+=1 print('No. of cols added:', ncols_add) if len(info_df2.columns) == df_ncols + ncols_add: print('PASS: mcsm style muts added to df' , '\n===============================================================') else: print('FAIL: No. of cols mismatch' ,'\nOriginal length:', df_ncols , '\nExpected no. of cols:', df_ncols + ncols_add , '\nGot:', len(info_df2.columns)) sys.exit() del(df_ncols, ncols_add) #%% now adding mutation style = _p.abc1cde info_df2['mutation'] = gene.lower() + '_' + info_df2['mut_info_f2'].astype(str) # convert to lowercase info_df2['mutation'] = info_df2['mutation'].str.lower() # quick sanity check check = info_df2['mutation'].value_counts().value_counts() == info_df2['mut_info_f2'].value_counts().value_counts() if check.all(): print('PASS: added column "mutation" containing mutation format: _p.abc1cde') else: print('FAIL: could not add "mutation" column!') sys.exit() #%% removing unnecessary columns cols_to_remove = ['chromosome_text', 'mut_region', 'symbol'] info_df2_formatted = info_df2.drop(cols_to_remove, axis = 1) if len(info_df2_formatted.columns) == info_df2.shape[1] - len(cols_to_remove): print('PASS: columns successfully dropped and dim match') else: print('FAIL: could not drop columns!') sys.exit() #%% write file print('\n=====================================================================' , '\nWriting output file:\n', info_df2_formatted , '\nNo. of rows:', len(info_df2_formatted) , '\nNo. of cols:', len(info_df2_formatted.columns)) info_df2_formatted.to_csv(snpinfo_formatted , index = False) #%% diff b/w allele0 and 1: or_df #https://stackoverflow.com/questions/40348541/pandas-diff-with-string #df = or_df.iloc[[5, 15, 17, 19, 34]] #df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int) #df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int)