updated code and made it tidy

2020-06-25 14:40:44 +01:00 · 2020-06-25 14:40:44 +01:00 · af65a86ff9
commit af65a86ff9
parent 3c6122a296
1 changed files with 208 additions and 42 deletions
--- a/scripts/combine_afs_ors.py
+++ b/scripts/combine_afs_ors.py
@ -92,22 +92,22 @@ del(in_filename_afor, in_filename_afor_kin, datadir, indir, outdir)
 # read input csv files to combine
 #========================
 snpinfo_df = pd.read_csv(infile0, sep = ',')
-snpinfo_ncols = len(snpinfo_df.columns) 
+#snpinfo_ncols = len(snpinfo_df.columns) 
-snpinfo_nrows = len(snpinfo_df) 
+#snpinfo.shape[0] = len(snpinfo_df) 
-print('No. of rows in', infile0, ':', snpinfo_nrows
+print('No. of rows in', infile0, ':', snpinfo_df.shape[0]
-      , '\nNo. of cols in', infile0, ':', snpinfo_ncols)
+      , '\nNo. of cols in', infile0, ':', snpinfo_df.shape[1])
 afor_df = pd.read_csv(infile1, sep = ',')
-afor_ncols = len(afor_df.columns) 
+#afor_ncols = len(afor_df.columns) 
-afor_nrows = len(afor_df) 
+#afor.shape[0] = len(afor_df) 
-print('No. of rows in', infile1, ':', afor_nrows
+print('No. of rows in', infile1, ':', afor_df.shape[0]
-      , '\nNo. of cols in', infile1, ':', afor_ncols)
+      , '\nNo. of cols in', infile1, ':', afor_df.shape[1])
 afor_kin_df = pd.read_csv(infile2, sep = ',')
-afor_kin_nrows = len(afor_kin_df)
+#afor_kin.shape[0] = len(afor_kin_df)
-afor_kin_ncols = len(afor_kin_df.columns)
+#afor_kin_ncols = len(afor_kin_df.columns)
-print('No. of rows in', infile2, ':', afor_kin_nrows
+print('No. of rows in', infile2, ':', afor_kin_df.shape[0]
-      , '\nNo. of cols in', infile2, ':', afor_kin_ncols)
+      , '\nNo. of cols in', infile2, ':', afor_kin_df.shape[1])
 #%% Process afor_df
 #1) pull all snp_info so you have ref_allele, etc
@ -115,19 +115,14 @@ print('No. of rows in', infile2, ':', afor_kin_nrows
 # find merging column
 left_df = afor_df.copy()
 left_df_nrows = len(left_df)
 left_df_ncols = len(left_df.columns)
 right_df =  snpinfo_df.copy()
 right_df_nrows = len(right_df)
 right_df_ncols = len(right_df.columns)
 common_cols  = np.intersect1d(left_df.columns, right_df.columns).tolist()
 print('Length of common cols:', len(common_cols)
      , '\ncommon column/s:', common_cols, 'type:', type(common_cols))
 print('selecting consistent dtypes for merging (object i.e string)')
 #https://stackoverflow.com/questions/44639772/python-pandas-column-dtype-object-causing-merge-to-fail-with-dtypewarning-colu
 print('selecting consistent dtypes for merging (object i.e string)')
 merging_cols = left_df[common_cols].select_dtypes(include = [object]).columns.tolist()
 print(merging_cols)
 nmerging_cols = len(merging_cols)
@ -138,24 +133,19 @@ print(' length of merging cols:', nmerging_cols
 print('Checking for duplicates in common col:', common_cols
      , '\nNo of duplicates:'
      , len(right_df[right_df.duplicated(common_cols)])
-      , '\noriginal length:', right_df_nrows)
+      , '\noriginal length:', right_df.shape[0])
 right_df = right_df[~right_df.duplicated(common_cols)]
-right_df_nrows = len(right_df)
+print('\nrevised length:', right_df.shape[0])
 print('\nrevised length:', right_df_nrows)
 # checking cross-over of mutations in the two dfs to merge
-ndiff1 = afor_nrows - afor_df['mutation'].isin(snpinfo_df['mutation']).sum()
+ndiff1 = left_df.shape[0] - left_df['mutation'].isin(right_df['mutation']).sum()
 print('There are', ndiff1, 'mutations with OR, but no snp_info'
      , '\nExtracting and writing out file')
-
+missing_mutinfo = left_df[~left_df['mutation'].isin(right_df['mutation'])]
 #afor_df[afor_df['mutation'].isin(snpinfo_df['mutation'])]
 missing_mutinfo = afor_df[~afor_df['mutation'].isin(snpinfo_df['mutation'])]
 #len(missing_mutinfo.duplicated(common_cols))
 #missing_mutinfo.to_csv('infoless_muts.csv')
-ndiff2 = snpinfo_nrows - snpinfo_df['mutation'].isin(afor_df['mutation']).sum()
+ndiff2 = right_df.shape[0] - right_df['mutation'].isin(left_df['mutation']).sum()
 print('There are', ndiff2, 'mutations that do not have OR, but have snp_info')
 # Define join type
@ -166,20 +156,19 @@ my_join = 'left'
 print('combing with join:', my_join)
 combined_df1 = pd.merge(left_df, right_df, on = merging_cols, how = my_join)
-print('nrows:', len(combined_df1)
+print('\nshape:', combined_df1.shape)
      , '\nshape:', combined_df1.shape)
 # inner = 252
-left_df_nrows - ndiff1
+left_df.shape[0] - ndiff1
 # outer = 331
-right_df_nrows + ndiff1
+right_df.shape[0] + ndiff1
 # right = 290
-right_df_nrows 
+right_df.shape[0] 
 # left = 293
-left_df_nrows
+left_df.shape[0]
 #%%
@ -195,19 +184,19 @@ print('combing with:', my_join)
 combined_df1 = pd.merge(left_df, right_df, on = merging_cols, how = my_join)
 if my_join == 'inner':
-    #expected_rows = left_df_nrows - ndiff1
+    #expected_rows = left_df.shape[0] - ndiff1
    expected_rows = left_df.shape[0] - ndiff1
 if my_join == 'outer':
-    #expected_rows = right_df_nrows + ndiff1
+    #expected_rows = right_df.shape[0] + ndiff1
    expected_rows = right_df.shape[0] + ndiff1     
 if my_join == 'right':
-    #expected_rows = right_df_nrows 
+    #expected_rows = right_df.shape[0] 
    expected_rows = right_df.shape[0]
 if my_join == 'left':
-    #expected_rows = left_df_nrows
+    #expected_rows = left_df.shape[0]
    expected_rows = left_df.shape[0]
 expected_cols = left_df.shape[1] + right_df.shape[1] - nmerging_cols
@ -224,9 +213,186 @@ print('\nExpected no. of rows:', expected_rows
 if fail:
    sys.exit()
-# update nrows and ncols
+# delete variables
-afor_info_nrows = len(afor_info_df)
+del(left_df, right_df, common_cols, merging_cols, nmerging_cols, my_join, ndiff1, ndiff2, missing_mutinfo
-afor_info_ncols = len(afor_info_df.columns)
+    , expected_rows, expected_cols, fail)
-#%%
+del(afor_df, snpinfo_df)
 #%% Second merge: combined_df1 and afor_kin_df
 left_df = combined_df1.copy()
 right_df = afor_kin_df.copy()
 common_cols  = np.intersect1d(left_df.columns, right_df.columns).tolist()
 print('Length of common cols:', len(common_cols)
      , '\ncommon column/s:', common_cols, 'type:', type(common_cols))
 #https://stackoverflow.com/questions/44639772/python-pandas-column-dtype-object-causing-merge-to-fail-with-dtypewarning-colu
 print('selecting consistent dtypes for merging (object i.e string)')
 #FIXME
 #merging_cols = left_df[common_cols].select_dtypes(include = [object]).columns.tolist()
 merging_cols = ['wild_type', 'mutant_type', 'mutationinformation']
 nmerging_cols_cols = len(merging_cols)                  
 print(merging_cols)
 nmerging_cols = len(merging_cols)
 print(' length of merging cols:', nmerging_cols
      , '\nmerging cols:', merging_cols, 'type:', type(merging_cols))
 ndiff1 = left_df.shape[0] - left_df['mutationinformation'].isin(right_df['mutationinformation']).sum()
 print('There are', ndiff1, 'mutations with OR, but not in OR kinship'
      , '\nExtracting and writing out file')
 missing_mutinfo = left_df[~left_df['mutationinformation'].isin(right_df['mutationinformation'])]
 #missing_mutinfo.to_csv('infoless_muts.csv')
 ndiff2 = right_df.shape[0] - right_df['mutationinformation'].isin(left_df['mutationinformation']).sum()
 print('There are', ndiff2, 'mutations that do not have OR, but have OR kinship')
 my_join = 'outer'
 fail = False 
 print('combing with:', my_join)
 combined_df2 = pd.merge(left_df, right_df, on = merging_cols, how = my_join)
 if my_join == 'inner':
    #expected_rows = left_df.shape[0] - ndiff1
    expected_rows = left_df.shape[0] - ndiff1
 if my_join == 'outer':
    #expected_rows = right_df.shape[0] + ndiff1
    expected_rows = right_df.shape[0] + ndiff1     
 if my_join == 'right':
    #expected_rows = right_df.shape[0] 
    expected_rows = right_df.shape[0]
 if my_join == 'left':
    #expected_rows = left_df.shape[0]
    expected_rows = left_df.shape[0]
 expected_cols = left_df.shape[1] + right_df.shape[1] - nmerging_cols
 if len(combined_df2) == expected_rows and len(combined_df2.columns) == expected_cols:
    print('PASS: successfully combined dfs with:', my_join, 'join')
 else:
    print('FAIL: combined_df\'s expected rows and cols not matched')
    fail = True 
 print('\nExpected no. of rows:', expected_rows
      , '\nGot:', len(combined_df2)
      , '\nExpected no. of cols:', expected_cols
      , '\nGot:', len(combined_df2.columns))
 if fail:
    sys.exit()
 #%% check duplicate cols: ones containing suffix '_x' or '_y'
 # should only be position
 foo = combined_df2.filter(regex = r'.*_x|_y', axis = 1)
 print(foo.columns) # should only be position
 # drop position col containing suffix '_y' and then rename col without suffix
 combined_or_df = combined_df2.drop(combined_df2.filter(regex = r'.*_y').columns, axis = 1)
 #combined_or_df['position_x'].head()
 # renaming columns
 #combined_or_df.rename(columns = {'position_x': 'position'}, inplace = True)
 #combined_or_df['position'].head()
 #recheck
 #foo = combined_or_df.filter(regex = r'.*_x|_y', axis = 1)
 #print(foo.columns) # should only be empty
 # remove '_x' from some cols
 import re
 def clean_colnames(colname):
     """Return colname with a trailing '_x' suffix removed.
     Names that do not end in '_x' (including names that merely contain
     '_x' in the middle, e.g. 'max_x_val') are returned unchanged.
     colname: column name as a str.
     Returns: the cleaned column name as a str.
     """
     # The earlier version sliced at re.search('.*_x', colname).start().
     # Because the greedy '.*' anchors the match at index 0, that slice was
     # always empty and every matching name collapsed to ''.
     # Anchoring on '_x$' applies the same rule as the rename() call that
     # follows this function in the script.
     return re.sub(r'_x$', '', colname)
 #https://stackoverflow.com/questions/26500156/renaming-column-in-dataframe-for-pandas-using-regular-expression
 combined_or_df.columns
 combined_or_df.rename(columns=lambda x: re.sub('_x$','',x), inplace = True)
 combined_or_df.columns
 #FIXME: this should be 0 when you run the 35k dataset
 combined_or_df['chromosome_number'].isna().sum()
 #%% rearranging columns
 print('Dim of df prefromatting:', combined_or_df.shape)
 print(combined_or_df.columns, '\nshape:', combined_or_df.shape)
 # removing unnecessary column
 combined_or_df = combined_or_df.drop(['symbol'], axis = 1)
 print(combined_or_df.columns, '\nshape:', combined_or_df.shape)
 #%% reorder columns
 #https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
 # setting column's order
 output_df = combined_or_df[['mutation',
 'mutationinformation', 
 'wild_type', 
 'position',
 'mutant_type',
 'chr_num_allele',
 'ref_allele',
 'alt_allele', 
 'mut_info', 
 'mut_type', 
 'gene_id', 
 'gene_number', 
 'mut_region', 
 'reference_allele', 
 'alternate_allele', 
 'chromosome_number',
 'af',
 'af_kin',
 'or_kin',
 'or_logistic', 
 'or_mychisq', 
 'est_chisq',
 'or_fisher',
 'ci_low_logistic',
 'ci_hi_logistic',
 'ci_low_fisher',
 'ci_hi_fisher',
 'pwald_kin', 
 'pval_logistic', 
 'pval_fisher',
 'pval_chisq',
 'beta_logistic',
 'beta_kin',
 'se_logistic',
 'se_kin',
 'zval_logistic',
 'logl_H1_kin',
 'l_remle_kin',
 'wt_3let',
 'mt_3let',
 'n_diff',
 'tot_diff',
 'n_miss']]
 # sanity check after rearranging
 if combined_or_df.shape == output_df.shape and set(combined_or_df.columns) == set(output_df.columns):
    print('PASS: Successfully formatted df with rearranged columns')
 else:
    sys.exit('FAIL: something went wrong when rearranging columns!')
 #%% write file
 print('\n====================================================================='
      , '\nWriting output file:\n', outfile
      , '\nNo.of rows:', len(output_df)
      , '\nNo. of cols:', len(output_df.columns))
 output_df.to_csv(outfile, index = False)