data extraction tidy up

2020-07-08 13:26:33 +01:00 · 2020-07-08 13:26:33 +01:00 · 65e6b28d9e
commit 65e6b28d9e
parent e4328df255
2 changed files with 195 additions and 105 deletions
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@ -50,6 +50,7 @@ Created on Tue Aug  6 12:56:03 2019
 #=======================================================================
 #%% load libraries
 import os, sys
+import re
 import pandas as pd
 #import numpy as np
 import argparse
@ -72,12 +73,14 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau
 args = arg_parser.parse_args()
 #=======================================================================
 #%% variable assignment: input and output paths & filenames
+drug = args.drug
+gene = args.gene
+
 #drug = 'pyrazinamide'
 #gene = 'pncA'

-drug = args.drug
-gene = args.gene
 gene_match = gene + '_p.'
+nssnp_match = gene_match+'[A-Z]{3}[0-9]+[A-Z]{3}'

 # building cols to extract
 dr_muts_col = 'dr_mutations_' + drug
@ -93,14 +96,16 @@ print('Extracting columns based on variables:\n'
 #=======================================================================
 #%% input and output dirs and files
 #=======
-# data dir
+# dirs
 #=======
 datadir = homedir + '/' + 'git/Data'
+indir = datadir + '/' + drug + '/' + 'input'
+outdir = datadir + '/' + drug + '/' + 'output'

 #=======
 # input
 #=======
-#in_filename  = 'original_tanushree_data_v2.csv' #19k
+in_filename  = 'original_tanushree_data_v2.csv' #19k
 in_filename  = 'mtb_gwas_meta_v3.csv' #33k
 infile = datadir + '/' + in_filename
 print('Input file: ', infile
@ -109,9 +114,7 @@ print('Input file: ', infile
 #=======
 # output
 #=======
-# several output files
-# output filenames in respective sections at the time of outputting files
-outdir = datadir + '/' + drug + '/' + 'output'
+# several output files: in respective sections at the time of outputting files
 print('Output filename: in the respective sections'
      , '\nOutput path: ', outdir
      , '\n=============================================================')
@ -126,15 +129,14 @@ master_data  = pd.read_csv(infile, sep = ',')

 # extract elevant columns to extract from meta data related to the drug

-#meta_data_ch = master_data[['id'
-#, 'country'
-#, 'lineage'
-#, 'sublineage'
-##, 'drtype' #19k only
-#, 'resistance'
-#, drug
-#, dr_muts_col
-#, other_muts_col]] 
+meta_data = master_data[['id'
+, 'country'
+, 'lineage'
+, 'sublineage'
+, 'drtype' #19k only
+, drug
+, dr_muts_col
+, other_muts_col]] 


 core_cols = ['id'
@ -160,6 +162,7 @@ variable_based_cols = [drug
                       , other_muts_col]
       
 cols_to_extract = core_cols + variable_based_cols
+print('Extracting', len(cols_to_extract), 'columns from master data')

 meta_data = master_data[cols_to_extract]  
   
@ -174,18 +177,21 @@ print('RESULT: Total samples:', total_samples
 meta_data.isna().sum()
 print('No. of NAs/column:' + '\n', meta_data.isna().sum()
 		, '\n===========================================================')
-
+#
 # glance
-meta_data.head()
+#meta_data.head()
+#total_samples - NA pyrazinamide = ?
+# 19K:  19265-6754 = 12511
+# 33K: 33681 - 23823 = 9858

 # equivalent of table in R
-# drug counts
+# drug counts: complete samples for OR calcs
 meta_data[drug].value_counts() 
 print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts()
 		, '\n===========================================================')

 # clear variables
-del(in_filename,infile)
+del(in_filename, infile)
 #del(outdir)
 #%%
 # !!!IMPORTANT!!! sanity check:
@ -208,11 +214,10 @@ na_count = meta_data[dr_muts_col].isna().sum()

 if len(clean_df) == (total_samples - na_count):
   print('PASS: clean_df extracted: length is', len(clean_df)
-         , '\nNo.of NAs =', na_count, '/', total_samples
+         , '\nNo.of NAs in', dr_muts_col, '=', na_count, '/', total_samples
         , '\n==========================================================')
 else:
-    print('FAIL: dropping NA failed'
-         , '\n==========================================================')
+    sys.exit('FAIL: Could not drop NA')

 dr_gene_count = 0
 wt = 0
@ -238,7 +243,7 @@ print('Total matches of', gene_match, 'in dr_muts_col:', dr_gene_count)
 print('Total samples with > 1', gene_match, 'muts in dr_muts_col:', len(id2_dr) )
 print('=================================================================')

-del(i, id, wt, id2_dr, clean_df, na_count, count_gene_dr, count_wt)
+del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)

 #========
 # Second: counting <gene_match> mutations in dr_muts_col column
@ -258,6 +263,7 @@ if len(clean_df) == (total_samples - na_count):
 else:
    print('FAIL: dropping NA failed'
         , '\n=========================================================')
+sys.exit()

 other_gene_count = 0
 wt_other = 0
@ -268,7 +274,7 @@ for i, id in enumerate(clean_df.id):
 #    print (i, id)
 #    id_other.append(id)
 #    count_gene_other = clean_df[other_muts_col].iloc[i].count('gene_match')
-    count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match)
+     count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match)
    if count_gene_other > 0:
        id_other.append(id)            
    if count_gene_other > 1:
@ -313,19 +319,25 @@ print('gene to extract:', gene_match )
 dr_based_cols = [drug, dr_muts_col]
       
 cols_to_extract = core_cols + dr_based_cols
+print('Extracting', len(cols_to_extract), 'columns from meta data')

 meta_data_dr = meta_data[cols_to_extract]

 del(dr_based_cols, cols_to_extract)

-print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
-print('actual dim:', meta_data_dr.shape
-	, '\n===============================================================')
+if meta_data_dr.shape[0] == len(meta_data) and meta_data_dr.shape[1] == (len(meta_data.columns)-1):
+    print('PASS: Dimensions match')
+else:
+    print('FAIL: Dimensions mismatch:'
+          , 'Expected dim should be:', len(meta_data), (len(meta_data.columns)-1)
+          , '\nGot:', meta_data_dr.shape
+	      , '\n===============================================================')
+sys.exit()

 # Extract within this the gene of interest using string match
 #meta_gene_dr = meta_data.loc[meta_data[dr_muts_col].str.contains('gene_match*', na = False)] 
-meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)] 
-
+meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)]
+                                             
 dr_id = meta_gene_dr['id'].unique()
 print('RESULT: No. of samples with dr muts in pncA:', len(dr_id))
 print('checking RESULT:',
@ -338,13 +350,14 @@ if len(id_dr) == len(meta_gene_dr):
 else:
    print('FAIL: length mismatch'
    , '\n===============================================================')
+sys.exit()

 dr_id = pd.Series(dr_id)

 #=================
-# other mutations: extract gene_match entries
+# other mutations: extract gene_match entries from other_muts_col
 #==================
-print('Extracting dr_muts from:', other_muts_col,'with other meta_data')
+print('Extracting other_muts from:', other_muts_col,'with other meta_data')
 # FIXME: replace drug with variable containing the drug name
 # !!! important !!!
 #meta_data_other = meta_data[['id'
@ -356,17 +369,24 @@ print('Extracting dr_muts from:', other_muts_col,'with other meta_data')
 #       , other_muts_col
 #        ]] 

-dr_based_cols = [drug, other_muts_col]
+other_based_cols = [drug, other_muts_col]
       
-cols_to_extract = core_cols + dr_based_cols
+cols_to_extract = core_cols + other_based_cols
+print('Extracting', len(cols_to_extract), 'columns from meta data')

 meta_data_other = meta_data[cols_to_extract]

-del(dr_based_cols, cols_to_extract)
+del(other_based_cols, cols_to_extract)

-print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
-print('actual dim:', meta_data_other.shape
-	, '\n===============================================================')
+
+if meta_data_other.shape[0] == len(meta_data) and meta_data_other.shape[1] == (len(meta_data.columns)-1):
+    print('PASS: Dimensions match')
+else:
+    print('FAIL: Dimensions mismatch:'
+          , 'Expected dim should be:', len(meta_data), (len(meta_data.columns)-1)
+          , '\nGot:', meta_data_other.shape
+	      , '\n===============================================================')
+sys.exit()

 # Extract within this the gene of interest using string match
 meta_gene_other  = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)] 
@ -383,6 +403,7 @@ if len(id_other) == len(meta_gene_other):
 else:
    print('FAIL: length mismatch'
    , '\n===============================================================')
+sys.exit()

 other_id = pd.Series(other_id)
 #%% Find common IDs
@ -392,9 +413,12 @@ print('RESULT: No. of common ids:', common_mut_ids)

 # sanity checks
 # check if True: should be since these are common
-dr_id.isin(other_id).sum() == other_id.isin(dr_id).sum()
-
-# check if the 24 Ids that are common are indeed the same!
+if dr_id.isin(other_id).sum() == other_id.isin(dr_id).sum():
+    print('PASS: Cross check on no. of common ids')
+else:
+    sys.exit('FAIL: Cross check on no. of common ids failed')
+    
+# check if the common are indeed the same!
 # bit of a tautology, but better safe than sorry!
 common_ids = dr_id[dr_id.isin(other_id)]
 common_ids = common_ids.reset_index()
@ -405,25 +429,29 @@ common_ids2 = common_ids2.reset_index()
 common_ids2.columns = ['index', 'id2']

 # should be True
-print(common_ids['id'].equals(common_ids2['id2']))
+if common_ids['id'].equals(common_ids2['id2']):
+    print('PASS: Further cross checks on common ids')
+else:
+    sys.exit('FAIL: Further cross checks on common ids')

 # good sanity check: use it later to check gene_sample_counts
 expected_gene_samples = ( len(meta_gene_dr) + len(meta_gene_other) - common_mut_ids ) 
-print('expected no. of gene samples:', expected_gene_samples)
+print('Expected no. of gene samples:', expected_gene_samples)
 print('=================================================================')
 #%% write file
 #print(outdir)
-out_filename0 = gene.lower() + '_common_ids.csv'
-outfile0 =  outdir + '/' + out_filename0
+out_filename_cid = gene.lower() + '_common_ids.csv'
+outfile_cid =  outdir + '/' + out_filename_cid

 #FIXME: CHECK line len(common_ids)
 print('Writing file:'
-      , '\nFile:', outfile0
-      , '\nExpected no. of rows:', len(common_ids)
+      , '\nFile:', outfile_cid
+      , '\nNo. of rows:', len(common_ids)
      , '\n=============================================================')

-common_ids.to_csv(outfile0, index = False)
-del(out_filename0)
+common_ids.to_csv(outfile_cid, index = False)
+
+del(out_filename_cid)

 # clear variables
 del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2)
@ -441,20 +469,19 @@ print('RESULT: actual no. of gene samples extracted:', extracted_gene_samples
 # sanity check: length of gene samples
 print('Performing sanity check:')
 if extracted_gene_samples == expected_gene_samples:
-    print('No. of gene samples:', len(meta_gene_all)
-          , '\nPASS: expected & actual no. of gene samples match'
+    print('PASS: expected & actual no. of gene samples match'
+          , '\nNo. of gene samples:', len(meta_gene_all)
          , '\n=========================================================')
 else:
-    print('FAIL: Debug please!'
-    , '\n===============================================================')
+    sys.exit('FAIL: Length mismatch in gene samples!')

 # count NA  in drug column
 gene_na = meta_gene_all[drug].isna().sum() 
-print('No. of gene samples without pza testing i.e NA in pza column:', gene_na)
+print('No. of gene samples without', drug, 'testing:', gene_na)

 # use it later to check number of complete samples from LF data
 comp_gene_samples = len(meta_gene_all) - gene_na
-print('comp gene samples tested for pza:', comp_gene_samples)
+print('comp gene samples tested for', drug, ':', comp_gene_samples)
 print('=================================================================')

 # Comment: This is still dirty data since these
@ -464,19 +491,9 @@ print('This is still dirty data: samples have ', gene_match, 'muts but may have
      , '\nsince the format for mutations is mut1; mut2, etc.'
      , '\n=============================================================')

-#%% tidy_split():Function to split mutations on specified delimiter: ';'
-#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
+
 print('Performing tidy_split(): to separate the mutations into indivdual rows')

-
-
-
-#TIDY SPLIT HERE
-
-
-
-
-
 #=========
 # DF1: dr_muts_col
 #=========
@ -495,7 +512,7 @@ dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.lstrip()
 # extract only the samples/rows with gene_match
 dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)]

-print('lengths after tidy split and extracting', gene_match, 'muts:'
+print('Lengths after tidy split and extracting', gene_match, 'muts:'
      , '\nold length:' , len(meta_gene_dr)
      , '\nlen after split:', len(dr_WF0)
      , '\ndr_gene_WF0 length:', len(dr_gene_WF0)
@ -507,6 +524,7 @@ if len(dr_gene_WF0) == dr_gene_count:
 else:
    print('FAIL: lengths mismatch'
    , '\n===============================================================')
+sys.exit()

 # count the freq of 'dr_muts' samples
 dr_muts_df = dr_gene_WF0 [['id', dr_muts_col]] 
@ -515,29 +533,42 @@ print('dim of dr_muts_df:', dr_muts_df.shape)
 # add freq column
 dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count') 
 #dr_muts_df['dr_sample_freq'] = dr_muts_df.loc[dr_muts_df.groupby('id')].transform('count')
-print('revised dim of dr_muts_df:', dr_muts_df.shape) 
+print('Revised dim of dr_muts_df:', dr_muts_df.shape) 

 c1 = dr_muts_df.dr_sample_freq.value_counts()
-print('counting no. of sample frequency:\n', c1
+print('Counting no. of sample frequency:\n', c1
      , '\n===================================================================')

 # sanity check: length of gene samples
 if len(dr_gene_WF0) == c1.sum():
    print('PASS: WF data has expected length'
-    , '\nlength of dr_gene WFO:', c1.sum()
+    , '\nLength of dr_gene WFO:', c1.sum()
    , '\n===============================================================')
 else:
-    print('FAIL: Debug please!'
-    , '\n===============================================================')
+    sys.exit('FAIL: length mismatch!')

 #!!! Important !!!
 # Assign 'column name' on which split was performed as an extra column
 # This is so you can identify if mutations are dr_type or other in the final df
-dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col)
-print('dim of dr_df:', dr_df.shape
+dr_df_all = dr_gene_WF0.assign(mutation_info = dr_muts_col)
+print('Dim of dr_df:', dr_df_all.shape
 	, '\n=============================================================='
 	, '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category'
 	, '\n===============================================================')
+
+# Further clean up to remove stop mutations as these are similar in format to
+# gene match but dont have a 'mutant aa' (for e.g: abc10* vs nssnp: abc10cde)
+dr_stop = dr_df_all[dr_muts_col].str.count('\*').sum()
+print('Removing stop mutations denoted by an asterix'
+      , '\nNo. of stop mutations:',dr_stop)
+
+dr_df = dr_df_all[~dr_df_all[dr_muts_col].str.contains('\*')]
+
+if len(dr_df) == len(dr_df_all) - dr_stop:
+    print('PASS: Successfully removed stop mutatiosn from dr_df')
+else:
+    sys.exit('FAIL: Could not remove stop mutations. Check regex or variable names?')
+
 #%%
 #=========
 # DF2: other_mutations_pyrazinamdie
@ -558,7 +589,7 @@ other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip()
 # extract only the samples/rows with gene_match
 other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)]

-print('lengths after tidy split and extracting', gene_match, 'muts:',
+print('Lengths after tidy split and extracting', gene_match, 'muts:',
      '\nold length:' , len(meta_gene_other),
      '\nlen after split:', len(other_WF1),
      '\nother_gene_WF1 length:', len(other_gene_WF1),
@ -568,36 +599,50 @@ if len(other_gene_WF1) == other_gene_count:
    print('PASS: length matches with expected length'
    , '\n===============================================================')
 else:
-    print('FAIL: lengths mismatch'
-    , '\n===============================================================')    
+    sys.exit('FAIL: lengths mismatch')
+
 # count the freq of 'other muts' samples
 other_muts_df = other_gene_WF1 [['id', other_muts_col]] 
-print('dim of other_muts_df:', other_muts_df.shape) 
+print('Dim of other_muts_df:', other_muts_df.shape) 

 # add freq column
 other_muts_df['other_sample_freq'] = other_muts_df.groupby('id')['id'].transform('count')
-print('revised dim of other_muts_df:', other_muts_df.shape) 
+print('Revised dim of other_muts_df:', other_muts_df.shape) 

 c2 = other_muts_df.other_sample_freq.value_counts()
-print('counting no. of sample frequency:\n', c2)
+print('Counting no. of sample frequency:\n', c2)
 print('=================================================================')
 # sanity check: length of gene samples
 if len(other_gene_WF1) == c2.sum():
    print('PASS: WF data has expected length'
-    , '\nlength of other_gene WFO:', c2.sum()
+    , '\nLength of other_gene WFO:', c2.sum()
    , '\n===============================================================')
 else:
-    print('FAIL: Debug please!'
-    , '\n===============================================================')
-
+    sys.exit('FAIL: Length mismatch')
+    
 #!!! Important !!!
 # Assign 'column name' on which split was performed as an extra column
 # This is so you can identify if mutations are dr_type or other in the final df
-other_df = other_gene_WF1.assign(mutation_info = other_muts_col)
-print('dim of other_df:', other_df.shape
+other_df_all = other_gene_WF1.assign(mutation_info = other_muts_col)
+print('dim of other_df:', other_df_all.shape
 	, '\n==============================================================='
 	, '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category'
 	, '\n===============================================================')
+
+
+# Further clean up to remove stop mutations as these are similar in format to
+# gene match but don't have a 'mutant aa' (for e.g: abc10* vs nssnp: abc10cde)
+other_stop = other_df_all[other_muts_col].str.count('\*').sum()
+print('Removing stop mutations denoted by an asterix'
+      , '\nNo. of stop mutations:',other_stop)
+
+other_df = other_df_all[~other_df_all[other_muts_col].str.contains('\*')]
+
+if len(other_df) == len(other_df_all) - other_stop:
+    print('PASS: Successfully removed stop mutatiosn from other_df')
+else:
+    sys.exit('FAIL: Could not remove stop mutations. Check regex or variable names?')
+
 #%%
 #==========
 # Concatentating the two dfs: equivalent of rbind in R
@ -606,8 +651,8 @@ print('dim of other_df:', other_df.shape
 # change column names to allow concat:
 # dr_muts.. & other_muts : 'mutation'
 print('Now concatenating the two dfs by row'
-      , '\nfirst assigning a common colname: "mutation" to the col containing muts'
-      , '\nthis is done for both dfs'
+      , '\nFirst assigning a common colname: "mutation" to the col containing muts'
+      , '\nThis is done for both dfs'
      , '\n===================================================================')

 dr_df.columns
@ -618,39 +663,58 @@ other_df.columns
 other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True)
 other_df.columns

-print('Now appending the two dfs:'
-      , '\ndr_df dim:', dr_df.shape
-      , '\nother_df dim:', other_df.shape
-      , '\ndr_df length:', len(dr_df)
-      , '\nother_df length:', len(other_df)
-      , '\nexpected length:', len(dr_df) +  len(other_df)
-      , '\n=============================================================')
+if len(dr_df.columns) == len(other_df.columns):
+    print('Checking dfs for concatening by rows:'
+          , '\ndr_df dim:', dr_df.shape
+          , '\nother_df dim:', other_df.shape
+          , '\nexpected nrows:', len(dr_df) +  len(other_df)
+          , '\n=============================================================')
+else:
+    sys.exit('FAIL: No. of cols mismatch for concatenating')

 # checking colnames before concat
 print('checking colnames BEFORE concatenating the two dfs...')
 if (set(dr_df.columns) == set(other_df.columns)):
    print('PASS: column names match necessary for merging two dfs')
 else:
-    print('FAIL: Debug please!')
+    sys.exit('FAIL: Colnames mismatch for concatenating!')

 # concatenate (axis = 0): Rbind
-gene_LF0 = pd.concat([dr_df, other_df], ignore_index = True, axis = 0)
+print('Now appending the two dfs: Rbind')
+gene_LF_comb = pd.concat([dr_df, other_df], ignore_index = True, axis = 0)
+
+print('Finding stop mutations in concatenated df')
+stop_muts = gene_LF_comb['mutation'].str.contains('\*').sum()
+stop_muts_group = gene_LF_comb['mutation'].str.counts('\*').value_counts
+gene_LF_comb.groupby(['mutation_info'])['mutation'].apply(lambda x: x[x.str.contains('\*')].count())
+
+
+gene_LF0 = gene_LF_comb[gene_LF_comb['mutation'].str.contains(nssnp_match, case = False)]
+
+
+
+
+
+
+
+
+

 # checking colnames and length after concat
 print('checking colnames AFTER concatenating the two dfs...')
 if (set(dr_df.columns) == set(gene_LF0.columns)):
-    print('PASS: column names match')
+    print('PASS: column names match
+          , '\n=============================================================')
 else:
-    print('FAIL: Debug please!')
+    sys.exit('FAIL: Colnames mismatch AFTER concatenating')
    
-print('checking length AFTER concatenating the two dfs...')
-
+print('checking concatened df')
 if len(gene_LF0) == len(dr_df) +  len(other_df):
    print('PASS:length of df after concat match'
    , '\n===============================================================')
 else:
-    print('FAIL: length mismatch'
-    , '\n===============================================================')
+    sys.exit('FAIL: length mismatch')
+
 #%%
 ###########
 # This is hopefully clean data, but just double check
@ -658,13 +722,37 @@ else:
 # mutations corresponding to gene_match* (string match pattern) 
 # this will be your list you run OR calcs 
 ###########
+#my_regex = 'pncA_p.[A-Z]{3}[0-9]+[A-Z]{3}'
+
+
+gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, case = False)]  
+len(gene_LF1)
+
+
+
+
+
+
+
+
+
+
+
+
+
 print('length of gene_LF0:', len(gene_LF0),
      '\nThis should be what you need. But just double check and extract', gene_match, 
-      '\nfrom LF0 (concatenated data) using string match:', gene_match)
+      '\nfrom LF0 (concatenated data) using string match:', gene_match
+      , '\nand double checking for stop mutations')
+
+if gene_LF0['mutation'].str.count('\*').sum() > 0:
+    sys.exit('FAIL: Stop mutations detected post concatenating dfs. Resolve at source!')

 print('Double checking and creating df: gene_LF1')
 gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(gene_match)]   

+
+
 if len(gene_LF0) == len(gene_LF1):
    print('PASS: length of gene_LF0 and gene_LF1 match',
          '\nconfirming extraction and concatenating worked correctly')