tidying script

2020-06-25 13:12:09 +01:00 · 2020-06-25 13:12:09 +01:00 · 3c6122a296
commit 3c6122a296
parent b82cc11dbe
1 changed files with 133 additions and 197 deletions
--- a/scripts/combine_afs_ors.py
+++ b/scripts/combine_afs_ors.py
@ -30,41 +30,46 @@ os.chdir(homedir + '/git/LSHTM_analysis/scripts')
 os.getcwd()

 # local import
-from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
+#from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
+from reference_dict import low_3letter_dict
 #=======================================================================
 #%% command line args
-arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
-arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pncA') # case sensitive
-args = arg_parser.parse_args()
+#arg_parser = argparse.ArgumentParser()
+#arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
+#arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pncA') # case sensitive
+#args = arg_parser.parse_args()
 #=======================================================================
 #%% variable assignment: input and output 
-#drug = 'pyrazinamide'
-#gene = 'pncA'
-#gene_match = gene + '_p.'
+drug = 'pyrazinamide'
+gene = 'pncA'
+gene_match = gene + '_p.'

 # cmd variables
-drug = args.drug
-gene = args.gene
-gene_match = gene + '_p.'
+#drug = args.drug
+#gene = args.gene
+#gene_match = gene + '_p.'

 #==========
 # dir
 #==========
 datadir = homedir + '/' + 'git/Data'
+indir = datadir + '/' + drug + '/' + 'input'
 outdir = datadir + '/' + drug + '/' + 'output'

 #=======
 # input
 #=======
+in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info.csv'
 in_filename_afor = gene.lower() + '_af_or.csv'
 in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'

+infile0 = indir + '/' + in_filename_snpinfo 
 infile1 = outdir + '/' + in_filename_afor
 infile2 = outdir + '/' + in_filename_afor_kin


-print('Input file1:', infile1
+print('Input file0:', infile0
+      , '\nInput file1:', infile1
      , '\nInput file2:', infile2
      , '\n===================================================================')

@ -77,7 +82,7 @@ print('Output file:', outfile
      , '\n===================================================================')


-del(in_filename_afor, in_filename_afor_kin, datadir, outdir)
+del(in_filename_afor, in_filename_afor_kin, datadir, indir, outdir)
 #%% end of variable assignment for input and output files
 #=======================================================================
 #%% format mutations
@ -86,211 +91,142 @@ del(in_filename_afor, in_filename_afor_kin, datadir, outdir)
 #========================
 # read input csv files to combine
 #========================
+snpinfo_df = pd.read_csv(infile0, sep = ',')
+snpinfo_ncols = len(snpinfo_df.columns) 
+snpinfo_nrows = len(snpinfo_df) 
+print('No. of rows in', infile0, ':', snpinfo_nrows
+      , '\nNo. of cols in', infile0, ':', snpinfo_ncols)
+
 afor_df = pd.read_csv(infile1, sep = ',')
-afor_df_ncols = len(afor_df.columns) 
-afor_df_nrows = len(afor_df) 
-print('No. of rows in', infile1, ':', afor_df_nrows
-      , '\nNo. of cols in', infile1, ':', afor_df_ncols)
+afor_ncols = len(afor_df.columns) 
+afor_nrows = len(afor_df) 
+print('No. of rows in', infile1, ':', afor_nrows
+      , '\nNo. of cols in', infile1, ':', afor_ncols)

 afor_kin_df = pd.read_csv(infile2, sep = ',')
-afor_kin_df_nrows = len(afor_kin_df)
-afor_kin_df_ncols = len(afor_kin_df.columns)
-print('No. of rows in', infile2, ':', afor_kin_df_nrows
-      , '\nNo. of cols in', infile2, ':', afor_kin_df_ncols)
+afor_kin_nrows = len(afor_kin_df)
+afor_kin_ncols = len(afor_kin_df.columns)
+print('No. of rows in', infile2, ':', afor_kin_nrows
+      , '\nNo. of cols in', infile2, ':', afor_kin_ncols)

-#=======
-# Iterate through the dict, create a lookup dict i.e
-# lookup_dict = {three_letter_code: one_letter_code}.
-# lookup dict should be the key and the value (you want to create a column for)
-# Then use this to perform the mapping separetly for wild type and mutant cols.
-# The three letter code is extracted using a string match match from the dataframe and then converted
-# to 'pandas series'since map only works in pandas series
-#=======
-gene_regex = gene_match.lower()+'(\w{3})'
-print('gene regex being used:', gene_regex)
+#%% Process afor_df
+#1) pull all snp_info so you have ref_allele, etc
+# i.e merge afor_df and snpinfo_df
+# find merging column

-# initialise a sub dict that is lookup dict for three letter code to 1-letter code
-# adding three more cols
-lookup_dict = dict()
-for k, v in my_aa_dict.items():
-    lookup_dict[k] = v['one_letter_code']
-#    wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on
-    wt = afor_df['mutation'].str.extract(gene_regex).squeeze()
-    afor_df['wild_type'] = wt.map(lookup_dict)   
-    mut = afor_df['mutation'].str.extract('\d+(\w{3})$').squeeze()
-    afor_df['mutant_type'] = mut.map(lookup_dict)
+left_df = afor_df.copy()
+left_df_nrows = len(left_df)
+left_df_ncols = len(left_df.columns)

-# extract position info from mutation column separetly using string match
-afor_df['position'] = afor_df['mutation'].str.extract(r'(\d+)') 
+right_df =  snpinfo_df.copy()
+right_df_nrows = len(right_df)
+right_df_ncols = len(right_df.columns)

-# combine the wild_type+poistion+mutant_type columns to generate 
-# mutationinformation (matches mCSM output field)
-# Remember to use .map(str) for int col types to allow string concatenation
+common_cols  = np.intersect1d(left_df.columns, right_df.columns).tolist()
+print('Length of common cols:', len(common_cols)
+      , '\ncommon column/s:', common_cols, 'type:', type(common_cols))

-afor_df['mutationinformation'] = afor_df['wild_type'] + afor_df['position'].map(str) + afor_df['mutant_type']
-print('Created column: mutationinformation'
-	, '\n====================================================================='
-    ,  afor_df['mutationinformation'].head(10))
+print('selecting consistent dtypes for merging (object i.e string)')
+#https://stackoverflow.com/questions/44639772/python-pandas-column-dtype-object-causing-merge-to-fail-with-dtypewarning-colu
+merging_cols = left_df[common_cols].select_dtypes(include = [object]).columns.tolist()
+print(merging_cols)
+nmerging_cols = len(merging_cols)
+print(' length of merging cols:', nmerging_cols
+      , '\nmerging cols:', merging_cols, 'type:', type(merging_cols))

-# sanity check
-ncols_add = 4 # beware of hardcoding (3 cols for mcsm style mut + 1 for concatenating them all)
-if len(afor_df.columns) == afor_df_ncols + ncols_add:
-    afor_df_ncols = len(afor_df.columns) # update afor_df_ncols after adding cols
-    print('PASS: successfully added', ncols_add, 'cols'
-          , '\nold length:', afor_df_ncols
-          , '\nnew length:', len(afor_df.columns))
-else:
-    print('FAIL: failed to add cols:'
-          , '\nExpected cols:', afor_df_ncols + ncols_add
-          , '\nGot:', len(afor_df.columns))
-    sys.exit()
-#%% Detect mutation format to see if you apply this func
-# FIXME
-#afor_df.iloc[[0]].str.match('pnca_')
-#afor_df.dtypes
+# drop duplicates else the expected rows don't match
+print('Checking for duplicates in common col:', common_cols
+      , '\nNo of duplicates:'
+      , len(right_df[right_df.duplicated(common_cols)])
+      , '\noriginal length:', right_df_nrows)

-#foo = afor_df.loc[:, afor_df.dtypes == object]
-
-genomic_mut_regex = gene_match.lower()+'\w{3}\d+\w{3}'
-print('gene regex being used:', genomic_mut_regex)
-afor_df[(afor_df == genomic_mut_regex).any(axis = 1)]
-
-#%% Finding common col to merge on
-# Define merging column: multiple cols have been used for merge else the common cols 
-# get suffixes '_x'  and '_y' attached
-# also, couldn't include 'position' in merging_cols since data types don't match
-merging_cols = ['wild_type', 'mutant_type', 'mutationinformation']
-ncommon_cols= len(merging_cols)
+right_df = right_df[~right_df.duplicated(common_cols)]
+right_df_nrows = len(right_df)
+print('\nrevised length:', right_df_nrows)

 # checking cross-over of mutations in the two dfs to merge
-ndiff1 = afor_kin_df_nrows - afor_df['mutationinformation'].isin(afor_kin_df['mutationinformation']).sum()
-print(ndiff1)
-ndiff2 = afor_kin_df_nrows - afor_kin_df['mutationinformation'].isin(afor_df['mutationinformation']).sum()
-print(ndiff2)
+ndiff1 = afor_nrows - afor_df['mutation'].isin(snpinfo_df['mutation']).sum()
+print('There are', ndiff1, 'mutations with OR, but no snp_info'
+      , '\nExtracting and writing out file')

-#%% combining dfs
+#afor_df[afor_df['mutation'].isin(snpinfo_df['mutation'])]
+missing_mutinfo = afor_df[~afor_df['mutation'].isin(snpinfo_df['mutation'])]
+#len(missing_mutinfo.duplicated(common_cols))
+
+#missing_mutinfo.to_csv('infoless_muts.csv')
+
+ndiff2 = snpinfo_nrows - snpinfo_df['mutation'].isin(afor_df['mutation']).sum()
+print('There are', ndiff2, 'mutations that do not have OR, but have snp_info')

 # Define join type
 #my_join = 'inner'
+#my_join = 'outer'
 #my_join = 'right'
-#my_join = 'left'
-my_join = 'outer'
+my_join = 'left'
+
+print('combing with join:', my_join)
+combined_df1 = pd.merge(left_df, right_df, on = merging_cols, how = my_join)
+print('nrows:', len(combined_df1)
+      , '\nshape:', combined_df1.shape)
+
+# inner = 252
+left_df_nrows - ndiff1
+
+# outer = 331
+right_df_nrows + ndiff1
+
+# right = 290
+right_df_nrows 
+
+# left = 293
+left_df_nrows
+
+
+#%%
+# see if you want an extra clause here!
+# Define join type
+#my_join = 'inner'
+#my_join = 'outer'
+#my_join = 'right'
+my_join = 'left'
+ 
 fail = False 
-# sanity check: how many muts from afor_kin_df are in afor_df. should be a complete subset
-if ndiff2 == 0:
-    print('PASS: all muts in afor_kin_df are present in afor_df'
-          , '\nProceeding with combining the dfs...')
-    
-    combined_df = pd.merge(afor_df, afor_kin_df, on = merging_cols, how = my_join)
-    
-    if my_join == ('outer' or 'left') :
 print('combing with:', my_join)
-        expected_rows = afor_df_nrows + ndiff1
+combined_df1 = pd.merge(left_df, right_df, on = merging_cols, how = my_join)
     
-    if my_join == ('inner' or 'right'):
-        print('combing with:', my_join)
-        expected_rows = afor_kin_df_nrows
+if my_join == 'inner':
+    #expected_rows = left_df_nrows - ndiff1
+    expected_rows = left_df.shape[0] - ndiff1
    
-    expected_cols = afor_df_ncols + afor_kin_df_ncols - ncommon_cols
+if my_join == 'outer':
+    #expected_rows = right_df_nrows + ndiff1
+    expected_rows = right_df.shape[0] + ndiff1     
    
-    if len(combined_df) == expected_rows and len(combined_df.columns) == expected_cols:
+if my_join == 'right':
+    #expected_rows = right_df_nrows 
+    expected_rows = right_df.shape[0]
+    
+if my_join == 'left':
+    #expected_rows = left_df_nrows
+    expected_rows = left_df.shape[0]
+    
+expected_cols = left_df.shape[1] + right_df.shape[1] - nmerging_cols
+  
+if len(combined_df1) == expected_rows and len(combined_df1.columns) == expected_cols:
    print('PASS: successfully combined dfs with:', my_join, 'join')
 else:
    print('FAIL: combined_df\'s expected rows and cols not matched')
-        fail = True # BAD practice! just a placeholder to avoid code duplication
-    
+    fail = True 
 print('\nExpected no. of rows:', expected_rows
-          , '\nGot:', len(combined_df)
+      , '\nGot:', len(combined_df1)
      , '\nExpected no. of cols:', expected_cols
-          , '\nGot:', len(combined_df.columns))
+      , '\nGot:', len(combined_df1.columns))
 if fail:
-        sys.exit('ERROR: combined_df may be incorrectly combined')
-else:
-    print('FAIL: numbers mismatch, mutations present in afor_kin_df but not in afor_df')
-    sys.exit('ERROR: Not all mutations in the kinship_df are present in the df with other ORs')
-    
-#%% check duplicate cols: ones containing suffix '_x' or '_y'
-# should only be position
-foo = combined_df.filter(regex = r'.*_x|_y', axis = 1)
-print(foo.columns) # should only be position
-
-# drop position col containing suffix '_y' and then rename col without suffix
-combined_or_df = combined_df.drop(combined_df.filter(regex = r'.*_y').columns, axis = 1)
-combined_or_df['position_x'].head()
-
-# renaming columns
-combined_or_df.rename(columns = {'position_x': 'position'}, inplace = True)
-combined_or_df['position'].head()
-
-# recheck
-foo = combined_or_df.filter(regex = r'.*_x|_y', axis = 1)
-print(foo.columns) # should only be empty
-
-#%% rearraging columns
-print('Dim of df prefromatting:', combined_or_df.shape)
-
-print(combined_or_df.columns)
-
-
-#%% reorder columns
-#https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
-
-# setting column's order
-output_df = combined_or_df[['mutation',
- 'mutationinformation', 
- 'wild_type', 
- 'position',
- 'mutant_type',
- 'chr_num_allele',
- 'ref_allele',
- 'alt_allele', 
- 'mut_info', 
- 'mut_type', 
- 'gene_id', 
- 'gene_number', 
- 'mut_region', 
- 'reference_allele', 
- 'alternate_allele', 
- 'chromosome_number',
- 'af',
- 'af_kin',
- 'or_kin',
- 'or_logistic', 
- 'or_mychisq', 
- 'est_chisq',
- 'or_fisher',
- 'ci_low_logistic',
- 'ci_hi_logistic',
- 'ci_low_fisher',
- 'ci_hi_fisher',
- 'pwald_kin', 
- 'pval_logistic', 
- 'pval_fisher',
- 'pval_chisq',
- 'beta_logistic',
- 'beta_kin',
- 'se_logistic',
- 'se_kin',
- 'zval_logistic',
- 'logl_H1_kin',
- 'l_remle_kin',
- 'n_diff',
- 'tot_diff',
- 'n_miss']]
-
-# sanity check after rearranging
-
-if combined_or_df.shape == output_df.shape and set(combined_or_df.columns) == set(output_df.columns):
-    print('PASS: Successfully formatted df with rearranged columns')
-else:
-    sys.exit('FAIL: something went wrong when rearranging columns!')
-  
-#%% write file
-print('\n====================================================================='
-      , '\nWriting output file:\n', outfile
-      , '\nNo.of rows:', len(output_df)
-      , '\nNo. of cols:', len(output_df.columns))
-output_df.to_csv(outfile, index = False)    
+    sys.exit()
    
+# update nrows and ncols
+afor_info_nrows = len(afor_info_df)
+afor_info_ncols = len(afor_info_df.columns)
+#%%