updated code and made it tidy

2020-06-25 14:40:44 +01:00 · 2020-06-25 14:40:44 +01:00 · e8a66a7a94
commit e8a66a7a94
parent 7032baa08d
1 changed files with 208 additions and 42 deletions
--- a/scripts/combine_afs_ors.py
+++ b/scripts/combine_afs_ors.py
@ -92,22 +92,22 @@ del(in_filename_afor, in_filename_afor_kin, datadir, indir, outdir)
 # read input csv files to combine
 #========================
 snpinfo_df = pd.read_csv(infile0, sep = ',')
-snpinfo_ncols = len(snpinfo_df.columns) 
-snpinfo_nrows = len(snpinfo_df) 
-print('No. of rows in', infile0, ':', snpinfo_nrows
-      , '\nNo. of cols in', infile0, ':', snpinfo_ncols)
+#snpinfo_ncols = len(snpinfo_df.columns) 
+#snpinfo.shape[0] = len(snpinfo_df) 
+print('No. of rows in', infile0, ':', snpinfo_df.shape[0]
+      , '\nNo. of cols in', infile0, ':', snpinfo_df.shape[1])

 afor_df = pd.read_csv(infile1, sep = ',')
-afor_ncols = len(afor_df.columns) 
-afor_nrows = len(afor_df) 
-print('No. of rows in', infile1, ':', afor_nrows
-      , '\nNo. of cols in', infile1, ':', afor_ncols)
+#afor_ncols = len(afor_df.columns) 
+#afor.shape[0] = len(afor_df) 
+print('No. of rows in', infile1, ':', afor_df.shape[0]
+      , '\nNo. of cols in', infile1, ':', afor_df.shape[1])

 afor_kin_df = pd.read_csv(infile2, sep = ',')
-afor_kin_nrows = len(afor_kin_df)
-afor_kin_ncols = len(afor_kin_df.columns)
-print('No. of rows in', infile2, ':', afor_kin_nrows
-      , '\nNo. of cols in', infile2, ':', afor_kin_ncols)
+#afor_kin.shape[0] = len(afor_kin_df)
+#afor_kin_ncols = len(afor_kin_df.columns)
+print('No. of rows in', infile2, ':', afor_kin_df.shape[0]
+      , '\nNo. of cols in', infile2, ':', afor_kin_df.shape[1])

 #%% Process afor_df
 #1) pull all snp_info so you have ref_allele, etc
@ -115,19 +115,14 @@ print('No. of rows in', infile2, ':', afor_kin_nrows
 # find merging column

 left_df = afor_df.copy()
-left_df_nrows = len(left_df)
-left_df_ncols = len(left_df.columns)
-
 right_df =  snpinfo_df.copy()
-right_df_nrows = len(right_df)
-right_df_ncols = len(right_df.columns)

 common_cols  = np.intersect1d(left_df.columns, right_df.columns).tolist()
 print('Length of common cols:', len(common_cols)
      , '\ncommon column/s:', common_cols, 'type:', type(common_cols))

-print('selecting consistent dtypes for merging (object i.e string)')
 #https://stackoverflow.com/questions/44639772/python-pandas-column-dtype-object-causing-merge-to-fail-with-dtypewarning-colu
+print('selecting consistent dtypes for merging (object i.e string)')
 merging_cols = left_df[common_cols].select_dtypes(include = [object]).columns.tolist()
 print(merging_cols)
 nmerging_cols = len(merging_cols)
@ -138,24 +133,19 @@ print(' length of merging cols:', nmerging_cols
 print('Checking for duplicates in common col:', common_cols
      , '\nNo of duplicates:'
      , len(right_df[right_df.duplicated(common_cols)])
-      , '\noriginal length:', right_df_nrows)
+      , '\noriginal length:', right_df.shape[0])

 right_df = right_df[~right_df.duplicated(common_cols)]
-right_df_nrows = len(right_df)
-print('\nrevised length:', right_df_nrows)
+print('\nrevised length:', right_df.shape[0])

 # checking cross-over of mutations in the two dfs to merge
-ndiff1 = afor_nrows - afor_df['mutation'].isin(snpinfo_df['mutation']).sum()
+ndiff1 = left_df.shape[0] - left_df['mutation'].isin(right_df['mutation']).sum()
 print('There are', ndiff1, 'mutations with OR, but no snp_info'
      , '\nExtracting and writing out file')
-
-#afor_df[afor_df['mutation'].isin(snpinfo_df['mutation'])]
-missing_mutinfo = afor_df[~afor_df['mutation'].isin(snpinfo_df['mutation'])]
-#len(missing_mutinfo.duplicated(common_cols))
-
+missing_mutinfo = left_df[~left_df['mutation'].isin(right_df['mutation'])]
 #missing_mutinfo.to_csv('infoless_muts.csv')

-ndiff2 = snpinfo_nrows - snpinfo_df['mutation'].isin(afor_df['mutation']).sum()
+ndiff2 = right_df.shape[0] - right_df['mutation'].isin(left_df['mutation']).sum()
 print('There are', ndiff2, 'mutations that do not have OR, but have snp_info')

 # Define join type
@ -166,20 +156,19 @@ my_join = 'left'

 print('combing with join:', my_join)
 combined_df1 = pd.merge(left_df, right_df, on = merging_cols, how = my_join)
-print('nrows:', len(combined_df1)
-      , '\nshape:', combined_df1.shape)
+print('\nshape:', combined_df1.shape)

 # inner = 252
-left_df_nrows - ndiff1
+left_df.shape[0] - ndiff1

 # outer = 331
-right_df_nrows + ndiff1
+right_df.shape[0] + ndiff1

 # right = 290
-right_df_nrows 
+right_df.shape[0] 

 # left = 293
-left_df_nrows
+left_df.shape[0]


 #%%
@ -195,19 +184,19 @@ print('combing with:', my_join)
 combined_df1 = pd.merge(left_df, right_df, on = merging_cols, how = my_join)
     
 if my_join == 'inner':
-    #expected_rows = left_df_nrows - ndiff1
+    #expected_rows = left_df.shape[0] - ndiff1
    expected_rows = left_df.shape[0] - ndiff1
    
 if my_join == 'outer':
-    #expected_rows = right_df_nrows + ndiff1
+    #expected_rows = right_df.shape[0] + ndiff1
    expected_rows = right_df.shape[0] + ndiff1     
    
 if my_join == 'right':
-    #expected_rows = right_df_nrows 
+    #expected_rows = right_df.shape[0] 
    expected_rows = right_df.shape[0]
    
 if my_join == 'left':
-    #expected_rows = left_df_nrows
+    #expected_rows = left_df.shape[0]
    expected_rows = left_df.shape[0]
    
 expected_cols = left_df.shape[1] + right_df.shape[1] - nmerging_cols
@ -224,9 +213,186 @@ print('\nExpected no. of rows:', expected_rows
 if fail:
    sys.exit()
    
-# update nrows and ncols
-afor_info_nrows = len(afor_info_df)
-afor_info_ncols = len(afor_info_df.columns)
-#%%
+# delete variables
+del(left_df, right_df, common_cols, merging_cols, nmerging_cols, my_join, ndiff1, ndiff2, missing_mutinfo
+    , expected_rows, expected_cols, fail)
+del(afor_df, snpinfo_df)


+
+
+
+
+
+
+#%% Second merge: combined_df1 and afor_kin_df
+
+left_df = combined_df1.copy()
+right_df = afor_kin_df.copy()
+
+common_cols  = np.intersect1d(left_df.columns, right_df.columns).tolist()
+print('Length of common cols:', len(common_cols)
+      , '\ncommon column/s:', common_cols, 'type:', type(common_cols))
+
+#https://stackoverflow.com/questions/44639772/python-pandas-column-dtype-object-causing-merge-to-fail-with-dtypewarning-colu
+print('selecting consistent dtypes for merging (object i.e string)')
+
+#FIXME: merging cols are hard-coded below; the dtype-based selection is commented out
+
+#merging_cols = left_df[common_cols].select_dtypes(include = [object]).columns.tolist()
+merging_cols = ['wild_type', 'mutant_type', 'mutationinformation']
+nmerging_cols_cols = len(merging_cols)                  
+         
+print(merging_cols)
+nmerging_cols = len(merging_cols)
+print(' length of merging cols:', nmerging_cols
+      , '\nmerging cols:', merging_cols, 'type:', type(merging_cols))
+
+ndiff1 = left_df.shape[0] - left_df['mutationinformation'].isin(right_df['mutationinformation']).sum()
+print('There are', ndiff1, 'mutations with OR, but not in OR kinship'
+      , '\nExtracting and writing out file')
+missing_mutinfo = left_df[~left_df['mutationinformation'].isin(right_df['mutationinformation'])]
+#missing_mutinfo.to_csv('infoless_muts.csv')
+
+ndiff2 = right_df.shape[0] - right_df['mutationinformation'].isin(left_df['mutationinformation']).sum()
+print('There are', ndiff2, 'mutations that do not have OR, but have OR kinship')
+
+my_join = 'outer'
+ 
+fail = False 
+print('combing with:', my_join)
+combined_df2 = pd.merge(left_df, right_df, on = merging_cols, how = my_join)
+     
+if my_join == 'inner':
+    #expected_rows = left_df.shape[0] - ndiff1
+    expected_rows = left_df.shape[0] - ndiff1
+    
+if my_join == 'outer':
+    #expected_rows = right_df.shape[0] + ndiff1
+    expected_rows = right_df.shape[0] + ndiff1     
+    
+if my_join == 'right':
+    #expected_rows = right_df.shape[0] 
+    expected_rows = right_df.shape[0]
+    
+if my_join == 'left':
+    #expected_rows = left_df.shape[0]
+    expected_rows = left_df.shape[0]
+    
+expected_cols = left_df.shape[1] + right_df.shape[1] - nmerging_cols
+  
+if len(combined_df2) == expected_rows and len(combined_df2.columns) == expected_cols:
+    print('PASS: successfully combined dfs with:', my_join, 'join')
+else:
+    print('FAIL: combined_df\'s expected rows and cols not matched')
+    fail = True 
+print('\nExpected no. of rows:', expected_rows
+      , '\nGot:', len(combined_df2)
+      , '\nExpected no. of cols:', expected_cols
+      , '\nGot:', len(combined_df2.columns))
+if fail:
+    sys.exit()
+#%% check duplicate cols: ones containing suffix '_x' or '_y'
+# should only be position
+foo = combined_df2.filter(regex = r'.*_x|_y', axis = 1)
+print(foo.columns) # should only be position
+
+# drop every col whose name matches '_y' (duplicates from the right df, e.g. position); the '_x' suffix on the kept cols is stripped further below
+combined_or_df = combined_df2.drop(combined_df2.filter(regex = r'.*_y').columns, axis = 1)
+#combined_or_df['position_x'].head()
+
+# renaming columns
+#combined_or_df.rename(columns = {'position_x': 'position'}, inplace = True)
+#combined_or_df['position'].head()
+#recheck
+#foo = combined_or_df.filter(regex = r'.*_x|_y', axis = 1)
+#print(foo.columns) # should only be empty
+
+
+# remove '_x' from some cols
+
+import re
+def clean_colnames(colname):
+    """Return colname with a trailing '_x' suffix removed.
+
+    '_x' is the suffix pandas appends to overlapping left-hand columns
+    after a merge. Names that do not end in '_x' are returned unchanged.
+
+    Parameters
+    ----------
+    colname : str
+        Column name to clean.
+
+    Returns
+    -------
+    str
+        The column name without a trailing '_x'.
+    """
+    # Anchor on the end of the name: the previous '.*_x' search always
+    # matched from index 0, so slicing at match.start() returned ''.
+    return re.sub(r'_x$', '', colname)
+    
+#https://stackoverflow.com/questions/26500156/renaming-column-in-dataframe-for-pandas-using-regular-expression
+combined_or_df.columns
+combined_or_df.rename(columns=lambda x: re.sub('_x$','',x), inplace = True)
+combined_or_df.columns
+
+#FIXME: this should be 0 when you run the 35k dataset
+combined_or_df['chromosome_number'].isna().sum()
+
+#%% rearranging columns
+print('Dim of df prefromatting:', combined_or_df.shape)
+
+print(combined_or_df.columns, '\nshape:', combined_or_df.shape)
+
+# removing unnecessary column
+combined_or_df = combined_or_df.drop(['symbol'], axis = 1)
+print(combined_or_df.columns, '\nshape:', combined_or_df.shape)
+#%% reorder columns
+#https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
+# setting column's order
+output_df = combined_or_df[['mutation',
+ 'mutationinformation', 
+ 'wild_type', 
+ 'position',
+ 'mutant_type',
+ 'chr_num_allele',
+ 'ref_allele',
+ 'alt_allele', 
+ 'mut_info', 
+ 'mut_type', 
+ 'gene_id', 
+ 'gene_number', 
+ 'mut_region', 
+ 'reference_allele', 
+ 'alternate_allele', 
+ 'chromosome_number',
+ 'af',
+ 'af_kin',
+ 'or_kin',
+ 'or_logistic', 
+ 'or_mychisq', 
+ 'est_chisq',
+ 'or_fisher',
+ 'ci_low_logistic',
+ 'ci_hi_logistic',
+ 'ci_low_fisher',
+ 'ci_hi_fisher',
+ 'pwald_kin', 
+ 'pval_logistic', 
+ 'pval_fisher',
+ 'pval_chisq',
+ 'beta_logistic',
+ 'beta_kin',
+ 'se_logistic',
+ 'se_kin',
+ 'zval_logistic',
+ 'logl_H1_kin',
+ 'l_remle_kin',
+ 'wt_3let',
+ 'mt_3let',
+ 'n_diff',
+ 'tot_diff',
+ 'n_miss']]
+
+# sanity check after rearranging
+if combined_or_df.shape == output_df.shape and set(combined_or_df.columns) == set(output_df.columns):
+    print('PASS: Successfully formatted df with rearranged columns')
+else:
+    sys.exit('FAIL: something went wrong when rearranging columns!')
+  
+#%% write file
+print('\n====================================================================='
+      , '\nWriting output file:\n', outfile
+      , '\nNo.of rows:', len(output_df)
+      , '\nNo. of cols:', len(output_df.columns))
+output_df.to_csv(outfile, index = False)    
+