diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index e0e656f..1cb703c 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -50,6 +50,7 @@ Created on Tue Aug 6 12:56:03 2019 #======================================================================= #%% load libraries import os, sys +import re import pandas as pd #import numpy as np import argparse @@ -72,12 +73,14 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output paths & filenames +drug = args.drug +gene = args.gene + #drug = 'pyrazinamide' #gene = 'pncA' -drug = args.drug -gene = args.gene gene_match = gene + '_p.' +nssnp_match = gene_match+'[A-Z]{3}[0-9]+[A-Z]{3}' # building cols to extract dr_muts_col = 'dr_mutations_' + drug @@ -93,14 +96,16 @@ print('Extracting columns based on variables:\n' #======================================================================= #%% input and output dirs and files #======= -# data dir +# dirs #======= datadir = homedir + '/' + 'git/Data' +indir = datadir + '/' + drug + '/' + 'input' +outdir = datadir + '/' + drug + '/' + 'output' #======= # input #======= -#in_filename = 'original_tanushree_data_v2.csv' #19k +in_filename = 'original_tanushree_data_v2.csv' #19k in_filename = 'mtb_gwas_meta_v3.csv' #33k infile = datadir + '/' + in_filename print('Input file: ', infile @@ -109,9 +114,7 @@ print('Input file: ', infile #======= # output #======= -# several output files -# output filenames in respective sections at the time of outputting files -outdir = datadir + '/' + drug + '/' + 'output' +# several output files: in respective sections at the time of outputting files print('Output filename: in the respective sections' , '\nOutput path: ', outdir , '\n=============================================================') @@ -126,15 +129,14 @@ master_data = pd.read_csv(infile, sep = ',') # extract elevant columns to extract from meta data related to the drug -#meta_data_ch = master_data[['id' -#, 'country' -#, 'lineage' -#, 'sublineage' -##, 'drtype' #19k only -#, 'resistance' -#, drug -#, dr_muts_col -#, other_muts_col]] +meta_data = master_data[['id' +, 'country' +, 'lineage' +, 'sublineage' +, 'drtype' #19k only +, drug +, dr_muts_col +, other_muts_col]] core_cols = ['id' @@ -160,6 +162,7 @@ variable_based_cols = [drug , other_muts_col] cols_to_extract = core_cols + variable_based_cols +print('Extracting', len(cols_to_extract), 'columns from master data') meta_data = master_data[cols_to_extract] @@ -174,18 +177,21 @@ print('RESULT: Total samples:', total_samples meta_data.isna().sum() print('No. of NAs/column:' + '\n', meta_data.isna().sum() , '\n===========================================================') - +# # glance -meta_data.head() +#meta_data.head() +#total_samples - NA pyrazinamide = ? +# 19K: 19265-6754 = 12511 +# 33K: 33681 - 23823 = 9858 # equivalent of table in R -# drug counts +# drug counts: complete samples for OR calcs meta_data[drug].value_counts() print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts() , '\n===========================================================') # clear variables -del(in_filename,infile) +del(in_filename, infile) #del(outdir) #%% # !!!IMPORTANT!!! sanity check: @@ -208,11 +214,10 @@ na_count = meta_data[dr_muts_col].isna().sum() if len(clean_df) == (total_samples - na_count): print('PASS: clean_df extracted: length is', len(clean_df) - , '\nNo.of NAs =', na_count, '/', total_samples + , '\nNo.of NAs in', dr_muts_col, '=', na_count, '/', total_samples , '\n==========================================================') else: - print('FAIL: dropping NA failed' - , '\n==========================================================') + sys.exit('FAIL: Could not drop NA') dr_gene_count = 0 wt = 0 @@ -238,7 +243,7 @@ print('Total matches of', gene_match, 'in dr_muts_col:', dr_gene_count) print('Total samples with > 1', gene_match, 'muts in dr_muts_col:', len(id2_dr) ) print('=================================================================') -del(i, id, wt, id2_dr, clean_df, na_count, count_gene_dr, count_wt) +del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt) #======== # Second: counting mutations in dr_muts_col column @@ -258,6 +263,7 @@ if len(clean_df) == (total_samples - na_count): else: print('FAIL: dropping NA failed' , '\n=========================================================') +sys.exit() other_gene_count = 0 wt_other = 0 @@ -268,7 +274,7 @@ for i, id in enumerate(clean_df.id): # print (i, id) # id_other.append(id) # count_gene_other = clean_df[other_muts_col].iloc[i].count('gene_match') - count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) + count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) if count_gene_other > 0: id_other.append(id) if count_gene_other > 1: @@ -313,19 +319,25 @@ print('gene to extract:', gene_match ) dr_based_cols = [drug, dr_muts_col] cols_to_extract = core_cols + dr_based_cols +print('Extracting', len(cols_to_extract), 'columns from meta data') meta_data_dr = meta_data[cols_to_extract] del(dr_based_cols, cols_to_extract) -print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) ) -print('actual dim:', meta_data_dr.shape - , '\n===============================================================') +if meta_data_dr.shape[0] == len(meta_data) and meta_data_dr.shape[1] == (len(meta_data.columns)-1): + print('PASS: Dimensions match') +else: + print('FAIL: Dimensions mismatch:' + , 'Expected dim should be:', len(meta_data), (len(meta_data.columns)-1) + , '\nGot:', meta_data_dr.shape + , '\n===============================================================') +sys.exit() # Extract within this the gene of interest using string match #meta_gene_dr = meta_data.loc[meta_data[dr_muts_col].str.contains('gene_match*', na = False)] -meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)] - +meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)] + dr_id = meta_gene_dr['id'].unique() print('RESULT: No. of samples with dr muts in pncA:', len(dr_id)) print('checking RESULT:', @@ -338,13 +350,14 @@ if len(id_dr) == len(meta_gene_dr): else: print('FAIL: length mismatch' , '\n===============================================================') +sys.exit() dr_id = pd.Series(dr_id) #================= -# other mutations: extract gene_match entries +# other mutations: extract gene_match entries from other_muts_col #================== -print('Extracting dr_muts from:', other_muts_col,'with other meta_data') +print('Extracting other_muts from:', other_muts_col,'with other meta_data') # FIXME: replace drug with variable containing the drug name # !!! important !!! #meta_data_other = meta_data[['id' @@ -356,17 +369,24 @@ print('Extracting dr_muts from:', other_muts_col,'with other meta_data') # , other_muts_col # ]] -dr_based_cols = [drug, other_muts_col] +other_based_cols = [drug, other_muts_col] -cols_to_extract = core_cols + dr_based_cols +cols_to_extract = core_cols + other_based_cols +print('Extracting', len(cols_to_extract), 'columns from meta data') meta_data_other = meta_data[cols_to_extract] -del(dr_based_cols, cols_to_extract) +del(other_based_cols, cols_to_extract) -print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) ) -print('actual dim:', meta_data_other.shape - , '\n===============================================================') + +if meta_data_other.shape[0] == len(meta_data) and meta_data_other.shape[1] == (len(meta_data.columns)-1): + print('PASS: Dimensions match') +else: + print('FAIL: Dimensions mismatch:' + , 'Expected dim should be:', len(meta_data), (len(meta_data.columns)-1) + , '\nGot:', meta_data_other.shape + , '\n===============================================================') +sys.exit() # Extract within this the gene of interest using string match meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)] @@ -383,6 +403,7 @@ if len(id_other) == len(meta_gene_other): else: print('FAIL: length mismatch' , '\n===============================================================') +sys.exit() other_id = pd.Series(other_id) #%% Find common IDs @@ -392,9 +413,12 @@ print('RESULT: No. of common ids:', common_mut_ids) # sanity checks # check if True: should be since these are common -dr_id.isin(other_id).sum() == other_id.isin(dr_id).sum() - -# check if the 24 Ids that are common are indeed the same! +if dr_id.isin(other_id).sum() == other_id.isin(dr_id).sum(): + print('PASS: Cross check on no. of common ids') +else: + sys.exit('FAIL: Cross check on no. of common ids failed') + +# check if the common are indeed the same! # bit of a tautology, but better safe than sorry! common_ids = dr_id[dr_id.isin(other_id)] common_ids = common_ids.reset_index() @@ -405,25 +429,29 @@ common_ids2 = common_ids2.reset_index() common_ids2.columns = ['index', 'id2'] # should be True -print(common_ids['id'].equals(common_ids2['id2'])) +if common_ids['id'].equals(common_ids2['id2']): + print('PASS: Further cross checks on common ids') +else: + sys.exit('FAIL: Further cross checks on common ids') # good sanity check: use it later to check gene_sample_counts expected_gene_samples = ( len(meta_gene_dr) + len(meta_gene_other) - common_mut_ids ) -print('expected no. of gene samples:', expected_gene_samples) +print('Expected no. of gene samples:', expected_gene_samples) print('=================================================================') #%% write file #print(outdir) -out_filename0 = gene.lower() + '_common_ids.csv' -outfile0 = outdir + '/' + out_filename0 +out_filename_cid = gene.lower() + '_common_ids.csv' +outfile_cid = outdir + '/' + out_filename_cid #FIXME: CHECK line len(common_ids) print('Writing file:' - , '\nFile:', outfile0 - , '\nExpected no. of rows:', len(common_ids) + , '\nFile:', outfile_cid + , '\nNo. of rows:', len(common_ids) , '\n=============================================================') -common_ids.to_csv(outfile0, index = False) -del(out_filename0) +common_ids.to_csv(outfile_cid, index = False) + +del(out_filename_cid) # clear variables del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2) @@ -441,20 +469,19 @@ print('RESULT: actual no. of gene samples extracted:', extracted_gene_samples # sanity check: length of gene samples print('Performing sanity check:') if extracted_gene_samples == expected_gene_samples: - print('No. of gene samples:', len(meta_gene_all) - , '\nPASS: expected & actual no. of gene samples match' + print('PASS: expected & actual no. of gene samples match' + , '\nNo. of gene samples:', len(meta_gene_all) , '\n=========================================================') else: - print('FAIL: Debug please!' - , '\n===============================================================') + sys.exit('FAIL: Length mismatch in gene samples!') # count NA in drug column gene_na = meta_gene_all[drug].isna().sum() -print('No. of gene samples without pza testing i.e NA in pza column:', gene_na) +print('No. of gene samples without', drug, 'testing:', gene_na) # use it later to check number of complete samples from LF data comp_gene_samples = len(meta_gene_all) - gene_na -print('comp gene samples tested for pza:', comp_gene_samples) +print('comp gene samples tested for', drug, ':', comp_gene_samples) print('=================================================================') # Comment: This is still dirty data since these @@ -464,19 +491,9 @@ print('This is still dirty data: samples have ', gene_match, 'muts but may have , '\nsince the format for mutations is mut1; mut2, etc.' , '\n=============================================================') -#%% tidy_split():Function to split mutations on specified delimiter: ';' -#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas + print('Performing tidy_split(): to separate the mutations into indivdual rows') - - - -#TIDY SPLIT HERE - - - - - #========= # DF1: dr_muts_col #========= @@ -495,7 +512,7 @@ dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.lstrip() # extract only the samples/rows with gene_match dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)] -print('lengths after tidy split and extracting', gene_match, 'muts:' +print('Lengths after tidy split and extracting', gene_match, 'muts:' , '\nold length:' , len(meta_gene_dr) , '\nlen after split:', len(dr_WF0) , '\ndr_gene_WF0 length:', len(dr_gene_WF0) @@ -507,6 +524,7 @@ if len(dr_gene_WF0) == dr_gene_count: else: print('FAIL: lengths mismatch' , '\n===============================================================') +sys.exit() # count the freq of 'dr_muts' samples dr_muts_df = dr_gene_WF0 [['id', dr_muts_col]] @@ -515,29 +533,42 @@ print('dim of dr_muts_df:', dr_muts_df.shape) # add freq column dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count') #dr_muts_df['dr_sample_freq'] = dr_muts_df.loc[dr_muts_df.groupby('id')].transform('count') -print('revised dim of dr_muts_df:', dr_muts_df.shape) +print('Revised dim of dr_muts_df:', dr_muts_df.shape) c1 = dr_muts_df.dr_sample_freq.value_counts() -print('counting no. of sample frequency:\n', c1 +print('Counting no. of sample frequency:\n', c1 , '\n===================================================================') # sanity check: length of gene samples if len(dr_gene_WF0) == c1.sum(): print('PASS: WF data has expected length' - , '\nlength of dr_gene WFO:', c1.sum() + , '\nLength of dr_gene WFO:', c1.sum() , '\n===============================================================') else: - print('FAIL: Debug please!' - , '\n===============================================================') + sys.exit('FAIL: length mismatch!') #!!! Important !!! # Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df -dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col) -print('dim of dr_df:', dr_df.shape +dr_df_all = dr_gene_WF0.assign(mutation_info = dr_muts_col) +print('Dim of dr_df:', dr_df_all.shape , '\n==============================================================' , '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category' , '\n===============================================================') + +# Further clean up to remove stop mutations as these are similar in format to +# gene match but dont have a 'mutant aa' (for e.g: abc10* vs nssnp: abc10cde) +dr_stop = dr_df_all[dr_muts_col].str.count('\*').sum() +print('Removing stop mutations denoted by an asterix' + , '\nNo. of stop mutations:',dr_stop) + +dr_df = dr_df_all[~dr_df_all[dr_muts_col].str.contains('\*')] + +if len(dr_df) == len(dr_df_all) - dr_stop: + print('PASS: Successfully removed stop mutatiosn from dr_df') +else: + sys.exit('FAIL: Could not remove stop mutations. Check regex or variable names?') + #%% #========= # DF2: other_mutations_pyrazinamdie @@ -558,7 +589,7 @@ other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() # extract only the samples/rows with gene_match other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)] -print('lengths after tidy split and extracting', gene_match, 'muts:', +print('Lengths after tidy split and extracting', gene_match, 'muts:', '\nold length:' , len(meta_gene_other), '\nlen after split:', len(other_WF1), '\nother_gene_WF1 length:', len(other_gene_WF1), @@ -568,36 +599,50 @@ if len(other_gene_WF1) == other_gene_count: print('PASS: length matches with expected length' , '\n===============================================================') else: - print('FAIL: lengths mismatch' - , '\n===============================================================') + sys.exit('FAIL: lengths mismatch') + # count the freq of 'other muts' samples other_muts_df = other_gene_WF1 [['id', other_muts_col]] -print('dim of other_muts_df:', other_muts_df.shape) +print('Dim of other_muts_df:', other_muts_df.shape) # add freq column other_muts_df['other_sample_freq'] = other_muts_df.groupby('id')['id'].transform('count') -print('revised dim of other_muts_df:', other_muts_df.shape) +print('Revised dim of other_muts_df:', other_muts_df.shape) c2 = other_muts_df.other_sample_freq.value_counts() -print('counting no. of sample frequency:\n', c2) +print('Counting no. of sample frequency:\n', c2) print('=================================================================') # sanity check: length of gene samples if len(other_gene_WF1) == c2.sum(): print('PASS: WF data has expected length' - , '\nlength of other_gene WFO:', c2.sum() + , '\nLength of other_gene WFO:', c2.sum() , '\n===============================================================') else: - print('FAIL: Debug please!' - , '\n===============================================================') - + sys.exit('FAIL: Length mismatch') + #!!! Important !!! # Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df -other_df = other_gene_WF1.assign(mutation_info = other_muts_col) -print('dim of other_df:', other_df.shape +other_df_all = other_gene_WF1.assign(mutation_info = other_muts_col) +print('dim of other_df:', other_df_all.shape , '\n===============================================================' , '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category' , '\n===============================================================') + + +# Further clean up to remove stop mutations as these are similar in format to +# gene match but don't have a 'mutant aa' (for e.g: abc10* vs nssnp: abc10cde) +other_stop = other_df_all[other_muts_col].str.count('\*').sum() +print('Removing stop mutations denoted by an asterix' + , '\nNo. of stop mutations:',other_stop) + +other_df = other_df_all[~other_df_all[other_muts_col].str.contains('\*')] + +if len(other_df) == len(other_df_all) - other_stop: + print('PASS: Successfully removed stop mutatiosn from other_df') +else: + sys.exit('FAIL: Could not remove stop mutations. Check regex or variable names?') + #%% #========== # Concatentating the two dfs: equivalent of rbind in R @@ -606,8 +651,8 @@ print('dim of other_df:', other_df.shape # change column names to allow concat: # dr_muts.. & other_muts : 'mutation' print('Now concatenating the two dfs by row' - , '\nfirst assigning a common colname: "mutation" to the col containing muts' - , '\nthis is done for both dfs' + , '\nFirst assigning a common colname: "mutation" to the col containing muts' + , '\nThis is done for both dfs' , '\n===================================================================') dr_df.columns @@ -618,39 +663,58 @@ other_df.columns other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True) other_df.columns -print('Now appending the two dfs:' - , '\ndr_df dim:', dr_df.shape - , '\nother_df dim:', other_df.shape - , '\ndr_df length:', len(dr_df) - , '\nother_df length:', len(other_df) - , '\nexpected length:', len(dr_df) + len(other_df) - , '\n=============================================================') +if len(dr_df.columns) == len(other_df.columns): + print('Checking dfs for concatening by rows:' + , '\ndr_df dim:', dr_df.shape + , '\nother_df dim:', other_df.shape + , '\nexpected nrows:', len(dr_df) + len(other_df) + , '\n=============================================================') +else: + sys.exit('FAIL: No. of cols mismatch for concatenating') # checking colnames before concat print('checking colnames BEFORE concatenating the two dfs...') if (set(dr_df.columns) == set(other_df.columns)): print('PASS: column names match necessary for merging two dfs') else: - print('FAIL: Debug please!') + sys.exit('FAIL: Colnames mismatch for concatenating!') # concatenate (axis = 0): Rbind -gene_LF0 = pd.concat([dr_df, other_df], ignore_index = True, axis = 0) +print('Now appending the two dfs: Rbind') +gene_LF_comb = pd.concat([dr_df, other_df], ignore_index = True, axis = 0) + +print('Finding stop mutations in concatenated df') +stop_muts = gene_LF_comb['mutation'].str.contains('\*').sum() +stop_muts_group = gene_LF_comb['mutation'].str.counts('\*').value_counts +gene_LF_comb.groupby(['mutation_info'])['mutation'].apply(lambda x: x[x.str.contains('\*')].count()) + + +gene_LF0 = gene_LF_comb[gene_LF_comb['mutation'].str.contains(nssnp_match, case = False)] + + + + + + + + + # checking colnames and length after concat print('checking colnames AFTER concatenating the two dfs...') if (set(dr_df.columns) == set(gene_LF0.columns)): - print('PASS: column names match') + print('PASS: column names match + , '\n=============================================================') else: - print('FAIL: Debug please!') + sys.exit('FAIL: Colnames mismatch AFTER concatenating') -print('checking length AFTER concatenating the two dfs...') - +print('checking concatened df') if len(gene_LF0) == len(dr_df) + len(other_df): print('PASS:length of df after concat match' , '\n===============================================================') else: - print('FAIL: length mismatch' - , '\n===============================================================') + sys.exit('FAIL: length mismatch') + #%% ########### # This is hopefully clean data, but just double check @@ -658,13 +722,37 @@ else: # mutations corresponding to gene_match* (string match pattern) # this will be your list you run OR calcs ########### +#my_regex = 'pncA_p.[A-Z]{3}[0-9]+[A-Z]{3}' + + +gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, case = False)] +len(gene_LF1) + + + + + + + + + + + + + print('length of gene_LF0:', len(gene_LF0), '\nThis should be what you need. But just double check and extract', gene_match, - '\nfrom LF0 (concatenated data) using string match:', gene_match) + '\nfrom LF0 (concatenated data) using string match:', gene_match + , '\nand double checking for stop mutations') + +if gene_LF0['mutation'].str.count('\*').sum() > 0: + sys.exit('FAIL: Stop mutations detected post concatenating dfs. Resolve at source!') print('Double checking and creating df: gene_LF1') gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(gene_match)] + + if len(gene_LF0) == len(gene_LF1): print('PASS: length of gene_LF0 and gene_LF1 match', '\nconfirming extraction and concatenating worked correctly') diff --git a/scripts/tidy_split.py b/scripts/tidy_split.py index 07630bb..3685557 100644 --- a/scripts/tidy_split.py +++ b/scripts/tidy_split.py @@ -18,7 +18,9 @@ import pandas as pd #os.chdir(homedir + '/git/LSHTM_analysis/scripts') #os.getcwd() #%%===================================================================== -# define the split function +# tidy_split():Function to split mutations on specified delimiter: ';' +#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas + def tidy_split(df, column, sep = '|', keep = False): ''' Split the values of a column and expand so the new DataFrame has one split