#!/usr/bin/env python3 # -*- coding: utf-8 -*- ''' Created on Tue Aug 6 12:56:03 2019 @author: tanu ''' # FIXME: include error checking to enure you only # concentrate on positions that have structural info? # FIXME: import dirs.py to get the basic dir paths available #======================================================================= # TASK: #======================================================================= #%% load libraries import os, sys import re import pandas as pd import numpy as np import argparse #======================================================================= #%% homdir and curr dir and local imports homedir = os.path.expanduser('~') # set working dir os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.getcwd() # import aa dict #from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! #from tidy_split import tidy_split #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() arg_parser.add_argument('-d', '--drug', help='drug name', default = None) arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output paths & filenames #drug = args.drug #gene = args.gene drug = 'pyrazinamide' gene = 'pncA' gene_match = gene + '_p.' print('mut pattern for gene', gene, ':', gene_match) nssnp_match = gene_match +'[A-Z]{3}[0-9]+[A-Z]{3}' print('nsSNP for gene', gene, ':', nssnp_match) wt_regex = gene_match.lower()+'(\w{3})' print('wt regex:', wt_regex) mut_regex = r'\d+(\w{3})$' print('mt regex:', mut_regex) pos_regex = r'(\d+)' print('position regex:', pos_regex) # building cols to extract dr_muts_col = 'dr_mutations_' + drug other_muts_col = 'other_mutations_' + drug dr_type = "resistance" print('Extracting columns based on variables:\n' , drug , '\n' , dr_type , '\n' , dr_muts_col , '\n' , other_muts_col , '\n===============================================================') #======================================================================= #%% input and output dirs and files #======= # dirs #======= datadir = homedir + '/' + 'git/Data' indir = datadir + '/' + drug + '/' + 'input' outdir = datadir + '/' + drug + '/' + 'output' #======= # input #======= #in_filename_master_master = 'original_tanushree_data_v2.csv' #19k in_filename_v2 = 'original_tanushree_data_v2.csv' #19k infile_master_v2 = datadir + '/' + in_filename_v2 print('Input file v2: ', infile_master_v2 , '\n============================================================') in_filename_v3 = 'mtb_gwas_meta_v3.csv' #33k infile_master_v3 = datadir + '/' + in_filename_v3 print('Input file v3: ', infile_master_v3 , '\n============================================================') in_filename_v4 = 'mtb_gwas_meta_v4.csv' #34k infile_master_v4 = datadir + '/' + in_filename_v4 print('Input file v4: ', infile_master_v4 , '\n============================================================') in_filename_v5 = 'mtb_gwas_meta_v5.csv' #34k infile_master_v5 = datadir + '/' + in_filename_v5 print('Input file v4: ', infile_master_v5 , '\n============================================================') #======= # output #======= # several output files: in respective sections at the time of outputting files print('Output filename: in the respective sections' , '\nOutput path: ', outdir , '\n=============================================================') #%%end of variable assignment for input and output files #======================================================================= #%% Read input file master_data_v2 = pd.read_csv(infile_master_v2, sep = ',', dtype = 'unicode') # ascii master_data_v3 = pd.read_csv(infile_master_v3, sep = ',', dtype = 'unicode') master_data_v4 = pd.read_csv(infile_master_v4, sep = ',', dtype = 'unicode') master_data_v5 = pd.read_csv(infile_master_v5, sep = ',', dtype = 'unicode') #DtypeWarning: Columns (48) have mixed types.Specify dtype option on import or set low_memory=False. # interactivity=interactivity, compiler=compiler, result=result) #========== # na_check #========== #================================================================== v2_na = master_data_v2.isna().sum() v2_na.name = "v2_na_count" v2_na = v2_na.to_frame() v2_na['v2_na_percent'] = master_data_v2.isna().mean().round(4)*100 master_data_v2['drtype'].value_counts() master_data_v2['drtype'].value_counts().sum() == len(master_data_v2) v2 = master_data_v2[['id' , 'country' , 'lineage' , 'sublineage' , 'drtype' , drug , dr_muts_col , other_muts_col]] v2.isna().sum() print('complete samples v2:', v2['id'].nunique() - v2[drug].isna().sum()) #================================================================== v3_na = master_data_v3.isna().sum() v3_na.name = "v3_na_count" v3_na = v3_na.to_frame() v3_na['v3_na_percent'] = master_data_v3.isna().mean().round(4)*100 master_data_v3['resistance'].value_counts() master_data_v3['resistance'].value_counts().sum() == len(master_data_v3) v3 = master_data_v3[['id' , 'country' , 'lineage' , 'sublineage' , 'resistance' , drug , dr_muts_col , other_muts_col]] v3.isna().sum() print('complete samples v3:', v3['id'].nunique() - v3[drug].isna().sum()) #================================================================== v4_na = master_data_v4.isna().sum() v4_na.name = "v4_na_count" v4_na = v4_na.to_frame() v4_na['v4_na_percent'] = master_data_v4.isna().mean().round(4)*100 v4 = master_data_v4[['id' , 'country' , 'lineage' , 'sublineage' , drug , dr_muts_col , other_muts_col]] v4.isna().sum() print('complete samples v4:', v4['id'].nunique() - v4[drug].isna().sum()) #================================================================== v5_na = master_data_v5.isna().sum() v5_na.name = "v5_na_count" v5_na = v5_na.to_frame() v5_na['v4_na_percent'] = master_data_v5.isna().mean().round(4)*100 v5 = master_data_v5[['id' , 'country' , 'lineage' , 'sublineage' , drug , dr_muts_col , other_muts_col]] v5.isna().sum() print('complete samples v5:', v5['id'].nunique() - v5[drug].isna().sum()) #==================================================================== # checking ids id_check1 = master_data_v2['id'].isin(master_data_v3['id']).sum() print('No. of 19k dataset (v1) ids in 33k dataset (v2):',id_check1) id_check2 = master_data_v2['id'].isin(master_data_v4['id']).sum() print('No. of 19k dataset (v1) ids in 34k dataset (v4):',id_check2) id_check3 = master_data_v4['id'].isin(master_data_v2['id']).sum() print('No. of 19k dataset (v1) ids in 34k dataset (v4):',id_check3) id_check4 = master_data_v3['sample_accession'].isin(master_data_v4['sample_accession']).sum() print('No. of 33k dataset (v3) ids in 34k dataset (v3):',id_check4) id_check5 = master_data_v4['sample_accession'].isin(master_data_v3['sample_accession']).sum() print('No. of 34k dataset (v4) ids in 33k dataset (v3):', id_check5) master_data_v3['sample_accession'].equals(master_data_v3['accession']) master_data_v3['sample_accession'].isin(master_data_v3['accession']).sum() master_data_v3['accession'].isin(master_data_v3['sample_accession']).sum() master_data_v4['sample_accession'].equals(master_data_v4['accession']) master_data_v4['sample_accession'].isin(master_data_v4['accession']).sum() master_data_v4['accession'].isin(master_data_v4['sample_accession']).sum() #=================================================================== #==================================================================== #which v3 cols are NOT IN V4 master_data_v3.columns[~master_data_v3.columns.isin(master_data_v4.columns)] # which v4 cols ARE NOT in v3 master_data_v4.columns[~master_data_v4.columns.isin(master_data_v3.columns)] # job: I need resistance and region in v4 data from v3 # find mergig cols np.intersect1d(master_data_v3.columns, master_data_v4.columns) master_data_v3['id'].nunique() == len(master_data_v3) master_data_v3['sample_accession'].nunique() == len(master_data_v3) master_data_v3['accession'].nunique() == len(master_data_v3) master_data_v3['run_accession'].nunique() == len(master_data_v3) master_data_v4['id'].nunique() == len(master_data_v4) master_data_v4['sample_accession'].nunique() == len(master_data_v4) master_data_v4['accession'].nunique() == len(master_data_v4) master_data_v4['run_accession'].nunique() == len(master_data_v4) c_v4 = master_data_v4[['id', 'sample', 'sample_accession', 'run_accession', 'accession' , 'location', 'country', 'geographic_source', 'country_code']] c_v4.isna().sum() c_v4_ids = master_data_v4[['id', 'sample', 'sample_accession', 'run_accession', 'accession']] c_v4_ids.isna().sum() c_v4_ids.eq(c_v4_ids.iloc[:, 0], axis=0) c_v3 = master_data_v3[['id', 'sample_accession', 'run_accession','accession' , 'location', 'country', 'geographic_source', 'region']] c_v3.isna().sum() c_v3_ids = master_data_v3[['id', 'sample_accession', 'run_accession', 'accession']] c_v3_ids.isna().sum() c_v3_ids.eq(c_v3_ids.iloc[:, 0], axis=0) # comment: id, sample, sample_accession and run_accession seem to have no na master_data_v4[drug].isna().sum() #%% Write file: mCSM muts #%% Write file: gene_metadata (i.e gene_LF1) # where each row has UNIQUE mutations NOT unique sample ids out_filename_metadata = gene.lower() + '_metadata.csv' outfile_metadata = outdir + '/' + out_filename_metadata print('Writing file: LF formatted data' , '\nFile:', outfile_metadata , '\n============================================================') gene_LF1.to_csv(outfile_metadata, header = True, index = False) print('Finished writing:', outfile_metadata , '\nNo. of rows:', len(gene_LF1) , '\nNo. of cols:', len(gene_LF1.columns) , '\n=============================================================') del(out_filename_metadata) #%% write file: mCSM style but with repitions for MSA and logo plots print('Writing file: mCSM style muts for msa', '\nFile:', outfile_msa, '\nmutation format (SNP): {WT}{MUT}', '\nNo.of lines of msa:', len(all_muts_msa)) all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False) print('Finished writing:', outfile_msa , '\nNo. of rows:', len(all_muts_msa) , '\nNo. of cols:', len(all_muts_msa.columns) , '\n=============================================================') del(out_filename_msa)