tidying script

2020-06-25 13:12:09 +01:00 · 2020-06-25 13:12:09 +01:00 · 7032baa08d
commit 7032baa08d
parent cdb1ea1476
1 changed files with 133 additions and 197 deletions
--- a/scripts/combine_afs_ors.py
+++ b/scripts/combine_afs_ors.py
@@ -30,41 +30,46 @@ os.chdir(homedir + '/git/LSHTM_analysis/scripts')
 os.getcwd()
 # local import
-from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
+#from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
 from reference_dict import low_3letter_dict
 #=======================================================================
 #%% command line args
-arg_parser = argparse.ArgumentParser()
+#arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
+#arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
-arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pncA') # case sensitive
+#arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pncA') # case sensitive
-args = arg_parser.parse_args()
+#args = arg_parser.parse_args()
 #=======================================================================
 #%% variable assignment: input and output 
-#drug = 'pyrazinamide'
+drug = 'pyrazinamide'
-#gene = 'pncA'
+gene = 'pncA'
-#gene_match = gene + '_p.'
+gene_match = gene + '_p.'
 # cmd variables
-drug = args.drug
+#drug = args.drug
-gene = args.gene
+#gene = args.gene
-gene_match = gene + '_p.'
+#gene_match = gene + '_p.'
 #==========
 # dir
 #==========
 datadir = homedir + '/' + 'git/Data'
 indir = datadir + '/' + drug + '/' + 'input'
 outdir = datadir + '/' + drug + '/' + 'output'
 #=======
 # input
 #=======
 in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info.csv'
 in_filename_afor = gene.lower() + '_af_or.csv'
 in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
 infile0 = indir + '/' + in_filename_snpinfo 
 infile1 = outdir + '/' + in_filename_afor
 infile2 = outdir + '/' + in_filename_afor_kin
-print('Input file1:', infile1
+print('Input file0:', infile0
      , '\nInput file1:', infile1
      , '\nInput file2:', infile2
      , '\n===================================================================')
@@ -77,7 +82,7 @@ print('Output file:', outfile
      , '\n===================================================================')
-del(in_filename_afor, in_filename_afor_kin, datadir, outdir)
+del(in_filename_afor, in_filename_afor_kin, datadir, indir, outdir)
 #%% end of variable assignment for input and output files
 #=======================================================================
 #%% format mutations
@@ -86,211 +91,142 @@ del(in_filename_afor, in_filename_afor_kin, datadir, outdir)
 #========================
 # read input csv files to combine
 #========================
 snpinfo_df = pd.read_csv(infile0, sep = ',')
 snpinfo_ncols = len(snpinfo_df.columns) 
 snpinfo_nrows = len(snpinfo_df) 
 print('No. of rows in', infile0, ':', snpinfo_nrows
      , '\nNo. of cols in', infile0, ':', snpinfo_ncols)
 afor_df = pd.read_csv(infile1, sep = ',')
-afor_df_ncols = len(afor_df.columns) 
+afor_ncols = len(afor_df.columns) 
-afor_df_nrows = len(afor_df) 
+afor_nrows = len(afor_df) 
-print('No. of rows in', infile1, ':', afor_df_nrows
+print('No. of rows in', infile1, ':', afor_nrows
-      , '\nNo. of cols in', infile1, ':', afor_df_ncols)
+      , '\nNo. of cols in', infile1, ':', afor_ncols)
 afor_kin_df = pd.read_csv(infile2, sep = ',')
-afor_kin_df_nrows = len(afor_kin_df)
+afor_kin_nrows = len(afor_kin_df)
-afor_kin_df_ncols = len(afor_kin_df.columns)
+afor_kin_ncols = len(afor_kin_df.columns)
-print('No. of rows in', infile2, ':', afor_kin_df_nrows
+print('No. of rows in', infile2, ':', afor_kin_nrows
-      , '\nNo. of cols in', infile2, ':', afor_kin_df_ncols)
+      , '\nNo. of cols in', infile2, ':', afor_kin_ncols)
-#=======
+#%% Process afor_df
-# Iterate through the dict, create a lookup dict i.e
+#1) pull all snp_info so you have ref_allele, etc
-# lookup_dict = {three_letter_code: one_letter_code}.
+# i.e merge afor_df and snpinfo_df
-# lookup dict should be the key and the value (you want to create a column for)
+# find merging column
 # Then use this to perform the mapping separately for wild type and mutant cols.
 # The three letter code is extracted using a string match from the dataframe and then converted
 # to a 'pandas series' since map only works on pandas series
 #=======
 gene_regex = gene_match.lower()+'(\w{3})'
 print('gene regex being used:', gene_regex)
-# initialise a sub dict that is lookup dict for three letter code to 1-letter code
+left_df = afor_df.copy()
-# adding three more cols
+left_df_nrows = len(left_df)
-lookup_dict = dict()
+left_df_ncols = len(left_df.columns)
 for k, v in my_aa_dict.items():
    lookup_dict[k] = v['one_letter_code']
 #    wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on
    wt = afor_df['mutation'].str.extract(gene_regex).squeeze()
    afor_df['wild_type'] = wt.map(lookup_dict)   
    mut = afor_df['mutation'].str.extract('\d+(\w{3})$').squeeze()
    afor_df['mutant_type'] = mut.map(lookup_dict)
-# extract position info from mutation column separately using string match
+right_df =  snpinfo_df.copy()
-afor_df['position'] = afor_df['mutation'].str.extract(r'(\d+)') 
+right_df_nrows = len(right_df)
 right_df_ncols = len(right_df.columns)
-# combine the wild_type+position+mutant_type columns to generate 
+common_cols  = np.intersect1d(left_df.columns, right_df.columns).tolist()
-# mutationinformation (matches mCSM output field)
+print('Length of common cols:', len(common_cols)
-# Remember to use .map(str) for int col types to allow string concatenation
+      , '\ncommon column/s:', common_cols, 'type:', type(common_cols))
-afor_df['mutationinformation'] = afor_df['wild_type'] + afor_df['position'].map(str) + afor_df['mutant_type']
+print('selecting consistent dtypes for merging (object i.e string)')
-print('Created column: mutationinformation'
+#https://stackoverflow.com/questions/44639772/python-pandas-column-dtype-object-causing-merge-to-fail-with-dtypewarning-colu
-	, '\n====================================================================='
+merging_cols = left_df[common_cols].select_dtypes(include = [object]).columns.tolist()
-    ,  afor_df['mutationinformation'].head(10))
+print(merging_cols)
 nmerging_cols = len(merging_cols)
 print(' length of merging cols:', nmerging_cols
      , '\nmerging cols:', merging_cols, 'type:', type(merging_cols))
-# sanity check
+# drop duplicates else the expected rows don't match
-ncols_add = 4 # beware of hardcoding (3 cols for mcsm style mut + 1 for concatenating them all)
+print('Checking for duplicates in common col:', common_cols
-if len(afor_df.columns) == afor_df_ncols + ncols_add:
+      , '\nNo of duplicates:'
-    afor_df_ncols = len(afor_df.columns) # update afor_df_ncols after adding cols
+      , len(right_df[right_df.duplicated(common_cols)])
-    print('PASS: successfully added', ncols_add, 'cols'
+      , '\noriginal length:', right_df_nrows)
          , '\nold length:', afor_df_ncols
          , '\nnew length:', len(afor_df.columns))
 else:
    print('FAIL: failed to add cols:'
          , '\nExpected cols:', afor_df_ncols + ncols_add
          , '\nGot:', len(afor_df.columns))
    sys.exit()
 #%% Detect mutation format to see if you apply this func
 # FIXME
 #afor_df.iloc[[0]].str.match('pnca_')
 #afor_df.dtypes
-#foo = afor_df.loc[:, afor_df.dtypes == object]
+right_df = right_df[~right_df.duplicated(common_cols)]
-
+right_df_nrows = len(right_df)
-genomic_mut_regex = gene_match.lower()+'\w{3}\d+\w{3}'
+print('\nrevised length:', right_df_nrows)
 print('gene regex being used:', genomic_mut_regex)
 afor_df[(afor_df == genomic_mut_regex).any(axis = 1)]
 #%% Finding common col to merge on
 # Define merging column: multiple cols have been used for merge else the common cols 
 # get suffixes '_x'  and '_y' attached
 # also, couldn't include 'position' in merging_cols since data types don't match
 merging_cols = ['wild_type', 'mutant_type', 'mutationinformation']
 ncommon_cols= len(merging_cols)
 # checking cross-over of mutations in the two dfs to merge
-ndiff1 = afor_kin_df_nrows - afor_df['mutationinformation'].isin(afor_kin_df['mutationinformation']).sum()
+ndiff1 = afor_nrows - afor_df['mutation'].isin(snpinfo_df['mutation']).sum()
-print(ndiff1)
+print('There are', ndiff1, 'mutations with OR, but no snp_info'
-ndiff2 = afor_kin_df_nrows - afor_kin_df['mutationinformation'].isin(afor_df['mutationinformation']).sum()
+      , '\nExtracting and writing out file')
 print(ndiff2)
-#%% combining dfs
+#afor_df[afor_df['mutation'].isin(snpinfo_df['mutation'])]
 missing_mutinfo = afor_df[~afor_df['mutation'].isin(snpinfo_df['mutation'])]
 #len(missing_mutinfo.duplicated(common_cols))
 #missing_mutinfo.to_csv('infoless_muts.csv')
 ndiff2 = snpinfo_nrows - snpinfo_df['mutation'].isin(afor_df['mutation']).sum()
 print('There are', ndiff2, 'mutations that do not have OR, but have snp_info')
 # Define join type
 #my_join = 'inner'
 #my_join = 'outer'
 #my_join = 'right'
-#my_join = 'left'
+my_join = 'left'
-my_join = 'outer'
+
-fail = False
+print('combing with join:', my_join)
-# sanity check: how many muts from afor_kin_df are in afor_df. should be a complete subset
+combined_df1 = pd.merge(left_df, right_df, on = merging_cols, how = my_join)
-if ndiff2 == 0:
+print('nrows:', len(combined_df1)
-    print('PASS: all muts in afor_kin_df are present in afor_df'
+      , '\nshape:', combined_df1.shape)
-          , '\nProceeding with combining the dfs...')
+
 # inner = 252
 left_df_nrows - ndiff1
 # outer = 331
 right_df_nrows + ndiff1
 # right = 290
 right_df_nrows 
 # left = 293
 left_df_nrows
 #%%
 # see if you want an extra clause here!
 # Define join type
 #my_join = 'inner'
 #my_join = 'outer'
 #my_join = 'right'
 my_join = 'left'
 fail = False 
 print('combing with:', my_join)
 combined_df1 = pd.merge(left_df, right_df, on = merging_cols, how = my_join)
 if my_join == 'inner':
    #expected_rows = left_df_nrows - ndiff1
    expected_rows = left_df.shape[0] - ndiff1
-    combined_df = pd.merge(afor_df, afor_kin_df, on = merging_cols, how = my_join)
+if my_join == 'outer':
    #expected_rows = right_df_nrows + ndiff1
    expected_rows = right_df.shape[0] + ndiff1     
-    if my_join == ('outer' or 'left') :
+if my_join == 'right':
-        print('combing with:', my_join)
+    #expected_rows = right_df_nrows 
-        expected_rows = afor_df_nrows + ndiff1
+    expected_rows = right_df.shape[0]
-        
+    
-    if my_join == ('inner' or 'right'):
+if my_join == 'left':
-        print('combing with:', my_join)
+    #expected_rows = left_df_nrows
-        expected_rows = afor_kin_df_nrows
+    expected_rows = left_df.shape[0]
-        
+    
-    expected_cols = afor_df_ncols + afor_kin_df_ncols - ncommon_cols
+expected_cols = left_df.shape[1] + right_df.shape[1] - nmerging_cols
-    if len(combined_df) == expected_rows and len(combined_df.columns) == expected_cols:
+if len(combined_df1) == expected_rows and len(combined_df1.columns) == expected_cols:
-        print('PASS: successfully combined dfs with:', my_join, 'join')
+    print('PASS: successfully combined dfs with:', my_join, 'join')
    else:
        print('FAIL: combined_df\'s expected rows and cols not matched')
        fail = True # BAD practice! just a placeholder to avoid code duplication
    print('\nExpected no. of rows:', expected_rows
          , '\nGot:', len(combined_df)
          , '\nExpected no. of cols:', expected_cols
          , '\nGot:', len(combined_df.columns))
    if fail:
        sys.exit('ERROR: combined_df may be incorrectly combined')
 else:
-    print('FAIL: numbers mismatch, mutations present in afor_kin_df but not in afor_df')
+    print('FAIL: combined_df\'s expected rows and cols not matched')
-    sys.exit('ERROR: Not all mutations in the kinship_df are present in the df with other ORs')
+    fail = True 
 print('\nExpected no. of rows:', expected_rows
      , '\nGot:', len(combined_df1)
      , '\nExpected no. of cols:', expected_cols
      , '\nGot:', len(combined_df1.columns))
 if fail:
    sys.exit()
-#%% check duplicate cols: ones containing suffix '_x' or '_y'
+# update nrows and ncols
-# should only be position
+afor_info_nrows = len(afor_info_df)
-foo = combined_df.filter(regex = r'.*_x|_y', axis = 1)
+afor_info_ncols = len(afor_info_df.columns)
-print(foo.columns) # should only be position
+#%%
 # drop position col containing suffix '_y' and then rename col without suffix
 combined_or_df = combined_df.drop(combined_df.filter(regex = r'.*_y').columns, axis = 1)
 combined_or_df['position_x'].head()
 # renaming columns
 combined_or_df.rename(columns = {'position_x': 'position'}, inplace = True)
 combined_or_df['position'].head()
 # recheck
 foo = combined_or_df.filter(regex = r'.*_x|_y', axis = 1)
 print(foo.columns) # should only be empty
 #%% rearranging columns
 print('Dim of df prefromatting:', combined_or_df.shape)
 print(combined_or_df.columns)
 #%% reorder columns
 #https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
 # setting column's order
 output_df = combined_or_df[['mutation',
 'mutationinformation', 
 'wild_type', 
 'position',
 'mutant_type',
 'chr_num_allele',
 'ref_allele',
 'alt_allele', 
 'mut_info', 
 'mut_type', 
 'gene_id', 
 'gene_number', 
 'mut_region', 
 'reference_allele', 
 'alternate_allele', 
 'chromosome_number',
 'af',
 'af_kin',
 'or_kin',
 'or_logistic', 
 'or_mychisq', 
 'est_chisq',
 'or_fisher',
 'ci_low_logistic',
 'ci_hi_logistic',
 'ci_low_fisher',
 'ci_hi_fisher',
 'pwald_kin', 
 'pval_logistic', 
 'pval_fisher',
 'pval_chisq',
 'beta_logistic',
 'beta_kin',
 'se_logistic',
 'se_kin',
 'zval_logistic',
 'logl_H1_kin',
 'l_remle_kin',
 'n_diff',
 'tot_diff',
 'n_miss']]
 # sanity check after rearranging
 if combined_or_df.shape == output_df.shape and set(combined_or_df.columns) == set(output_df.columns):
    print('PASS: Successfully formatted df with rearranged columns')
 else:
    sys.exit('FAIL: something went wrong when rearranging columns!')
 #%% write file
 print('\n====================================================================='
      , '\nWriting output file:\n', outfile
      , '\nNo.of rows:', len(output_df)
      , '\nNo. of cols:', len(output_df.columns))
 output_df.to_csv(outfile, index = False)