#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
# FIXME: include error checking to ensure you only
# concentrate on positions that have structural info?
# FIXME: import dirs.py to get the basic dir paths available
#=======================================================================
# TASK: extract ALL matched mutations from GWAS data
# Input data file has the following format: each row = unique sample id
# id,country,lineage,sublineage,drtype,drug,dr_muts_col,other_muts_col...
# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,gene_matchPOS; pncA_c.POS...
# where multiple mutations and multiple mutation types are separated by ';'.
# We are interested in the protein coding region i.e. mutations with the 'p.' format.
# This script splits the mutations on the ';' and extracts protein coding muts only,
# so that each row is a separate mutation.
# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique
# output files: all lower case
# 1) _gwas.csv
# 2) _common_ids.csv
# 3) _ambiguous_muts.csv
# 4) _mcsm_formatted_snps.csv
# 5) _metadata_poscounts.csv
# 6) _metadata.csv
# 7) _all_muts_msa.csv
# 8) _mutational_positons.csv
#-------------
# NOTE
#-------------
# drtype is renamed to 'resistance' in the 35k dataset
# all colnames in the output files are lowercase
#-------------
# requires
#-------------
# reference_dict.py
# tidy_split.py
#=======================================================================
#%% load libraries
import os, sys
import re
import pandas as pd
import numpy as np
import argparse
#=======================================================================
#%% dir and local imports
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help = 'drug name (case sensitive)', default = None)
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes datadir + <drug>/input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes datadir + <drug>/output')
arg_parser.add_argument('-m', '--make_dirs', help = 'Make dirs for input and output', action = 'store_true')
arg_parser.add_argument('--debug', action = 'store_true', help = 'Debug Mode')
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output paths & filenames
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
make_dirs = args.make_dirs
#drug = 'streptomycin'
#gene = 'gid'
#%% input and output dirs and files
#=======
# dirs
#=======
if not datadir:
    datadir = homedir + '/' + 'git/Data'

if not indir:
    indir = datadir + '/' + drug + '/input'

if not outdir:
    outdir = datadir + '/' + drug + '/output'

if make_dirs:
    print('make_dirs is turned on, creating data dir:', datadir)
    try:
        os.makedirs(datadir, exist_ok = True)
        print("Directory '%s' created successfully" % datadir)
    except OSError as error:
        print("Directory '%s' can not be created:" % datadir, error)

    print('make_dirs is turned on, creating indir:', indir)
    try:
        os.makedirs(indir, exist_ok = True)
        print("Directory '%s' created successfully" % indir)
    except OSError as error:
        print("Directory '%s' can not be created:" % indir, error)

    print('make_dirs is turned on, creating outdir:', outdir)
    try:
        os.makedirs(outdir, exist_ok = True)
        print("Directory '%s' created successfully" % outdir)
    except OSError as error:
        print("Directory '%s' can not be created:" % outdir, error)

# handle missing dirs here
if not os.path.isdir(datadir):
    print('ERROR: Data directory does not exist:', datadir
          , '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option --make_dirs')
    sys.exit()

if not os.path.isdir(indir):
    print('ERROR: Input directory does not exist:', indir
          , '\nPlease either create or specify indir and rerun\nelse specify cmd option --make_dirs')
    sys.exit()

if not os.path.isdir(outdir):
    print('ERROR: Output directory does not exist:', outdir
          , '\nPlease create or specify outdir and rerun\nelse specify cmd option --make_dirs')
    sys.exit()

# Requires
from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
from tidy_split import tidy_split
#=======================================================================
gene_match = gene + '_p.'
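#-----------------------------------------------------------------------
# Illustrative sanity check (a minimal sketch, NOT part of the pipeline):
# shows how the gene_match prefix is meant to pick out the protein-level
# ('p.') entries from a ';'-separated mutation string, mirroring the
# splitting on ';' described in the header. The toy string below is
# hypothetical and not taken from the gwas data.
#-----------------------------------------------------------------------
if args.debug:
    toy_muts = gene.lower() + '_p.Ala102Pro; ' + gene.lower() + '_c.-32T>C'
    toy_p_muts = [m.strip() for m in toy_muts.split(';') if gene_match.lower() in m]
    print('DEBUG: toy mutation string :', toy_muts
          , '\nDEBUG: protein coding muts:', toy_p_muts)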
print('mut pattern for gene', gene, ':', gene_match)

nssnp_match = gene_match + '[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
print('nsSNP for gene', gene, ':', nssnp_match)

wt_regex = gene_match.lower() + '([A-Za-z]{3})'
print('wt regex:', wt_regex)

mut_regex = r'[0-9]+(\w{3})$'
print('mut regex:', mut_regex)

pos_regex = r'([0-9]+)'
print('position regex:', pos_regex)

# building cols to extract
dr_muts_col = 'dr_mutations_' + drug
other_muts_col = 'other_mutations_' + drug
resistance_col = 'drtype'

print('Extracting columns based on variables:\n', drug
      , '\n', dr_muts_col
      , '\n', other_muts_col
      , '\n', resistance_col
      , '\n===============================================================')
#=======================================================================
#=======
# input
#=======
#in_filename_master = 'original_tanushree_data_v2.csv' #19k
in_filename_master = 'mtb_gwas_meta_v6.csv' #35k
infile_master = datadir + '/' + in_filename_master
print('Input file: ', infile_master
      , '\n============================================================')

#=======
# output
#=======
# several output files: written in the respective sections below
print('Output filename: in the respective sections'
      , '\nOutput path: ', outdir
      , '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input file
master_data = pd.read_csv(infile_master, sep = ',')

# column names
#list(master_data.columns)

# relevant columns to extract from the meta data, depending on the dataset version
if in_filename_master == 'original_tanushree_data_v2.csv':
    meta_data = master_data[['id'
                             , 'country'
                             , 'lineage'
                             , 'sublineage'
                             , 'drtype'
                             , drug
                             , dr_muts_col
                             , other_muts_col]]
else:
    core_cols = ['id'
                 , 'sample'
                 #, 'patient_id'
                 #, 'strain'
                 , 'lineage'
                 , 'sublineage'
                 #, 'country'
                 , 'country_code'
                 , 'geographic_source'
                 #, 'region'
                 #, 'location'
                 #, 'host_body_site'
                 #, 'environment_material'
                 #, 'host_status'
                 #, 'host_sex'
                 #, 'submitted_host_sex'
                 #, 'hiv_status'
                 #, 'HIV_status'
                 #, 'tissue_type'
                 #, 'isolation_source'
                 , resistance_col]

    variable_based_cols = [drug
                           , dr_muts_col
                           , other_muts_col]

    cols_to_extract = core_cols + variable_based_cols
    print('Extracting', len(cols_to_extract), 'columns from master data')

    meta_data = master_data[cols_to_extract]
    del(master_data, variable_based_cols, cols_to_extract)

print('Extracted meta data from filename:', in_filename_master
      , '\nDim:', meta_data.shape)

# checks and results
total_samples = meta_data['id'].nunique()
print('RESULT: Total samples:', total_samples
      , '\n===========================================================')

# count NAs per column
meta_data.isna().sum()
print('No. of NAs/column:' + '\n', meta_data.isna().sum()
      , '\n===========================================================\n')
#%% Write check file
check_file = outdir + '/' + gene.lower() + '_gwas.csv'
meta_data.to_csv(check_file, index = False)
print('\n----------------------------------'
      , '\nWriting subsetted gwas data:', gene
      , '\n----------------------------------'
      , '\nFile:', check_file
      , '\nDim:', meta_data.shape)

# equivalent of table in R
# drug counts: complete samples for OR calcs
meta_data[drug].value_counts()
print('===========================================================\n'
      , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
      , '\n===========================================================\n'
      , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
      , '\n===========================================================')
#%%
#!!!!!!!!!!
# quick tally of indels and stop codons in the mutation columns
# deletions ('del') in the dr and other mutation columns
foo = meta_data[dr_muts_col].value_counts()
foo = foo.reset_index(name = 'values')
foo.columns = [dr_muts_col, 'dr_muts_count'] #171
foo_count = foo.loc[foo[dr_muts_col].str.contains('del', na = False, regex = True, case = False)]

bar = meta_data[other_muts_col].value_counts()
bar = bar.reset_index(name = 'values')
bar.columns = [other_muts_col, 'other_muts_count'] #64
bar_count = bar.loc[bar[other_muts_col].str.contains('del', na = False, regex = True, case = False)]

tot = len(foo_count) + len(bar_count)
n_del = tot/len(meta_data)
n_del*100 #0.6
#============================
# insertions ('ins') in the dr and other mutation columns
foo2 = meta_data[dr_muts_col].value_counts()
foo2 = foo2.reset_index(name = 'values')
foo2.columns = [dr_muts_col, 'dr_muts_count'] #171
foo2_count = foo2.loc[foo2[dr_muts_col].str.contains('ins', na = False, regex = True, case = False)]

bar2 = meta_data[other_muts_col].value_counts()
bar2 = bar2.reset_index(name = 'values')
bar2.columns = [other_muts_col, 'other_muts_count'] #64
bar2_count = bar2.loc[bar2[other_muts_col].str.contains('ins', na = False, regex = True, case = False)]

tot2 = len(foo2_count) + len(bar2_count)
n_ins = tot2/len(meta_data)
n_ins*100 #0.5
#=========
del_ins_tot = n_del*100 + n_ins*100
#==========
# stop codons ('*') in the dr and other mutation columns
baz_count = foo.loc[foo[dr_muts_col].str.contains(r'\*', na = False, regex = True, case = False)]
baz_count2 = bar.loc[bar[other_muts_col].str.contains(r'\*', na = False, regex = True, case = False)]
tot3 = len(baz_count) + len(baz_count2)
n_stop = tot3/len(meta_data)
n_stop*100 #0.11

all_tot = n_del*100 + n_ins*100 + n_stop*100 #1.2
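#-----------------------------------------------------------------------
# Illustrative sanity check (a minimal sketch, NOT part of the pipeline):
# applies wt_regex, mut_regex and pos_regex (defined near the top) to a
# single hypothetical nsSNP to show what each capture group returns,
# i.e. wild-type aa, mutant aa and position.
#-----------------------------------------------------------------------
if args.debug:
    toy_nssnp = gene.lower() + '_p.Ala102Pro' # hypothetical mutation, not from the data
    print('DEBUG: toy nsSNP :', toy_nssnp
          , '\nDEBUG: wt aa   :', re.search(wt_regex, toy_nssnp).group(1)  # 'Ala'
          , '\nDEBUG: mut aa  :', re.search(mut_regex, toy_nssnp).group(1) # 'Pro'
          , '\nDEBUG: position:', re.search(pos_regex, toy_nssnp).group(1)) # '102'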