diff --git a/scripts/combine_afs_ors.py b/scripts/combine_afs_ors.py index 3f5558c..fb21980 100755 --- a/scripts/combine_afs_ors.py +++ b/scripts/combine_afs_ors.py @@ -92,22 +92,22 @@ del(in_filename_afor, in_filename_afor_kin, datadir, indir, outdir) # read input csv files to combine #======================== snpinfo_df = pd.read_csv(infile0, sep = ',') -snpinfo_ncols = len(snpinfo_df.columns) -snpinfo_nrows = len(snpinfo_df) -print('No. of rows in', infile0, ':', snpinfo_nrows - , '\nNo. of cols in', infile0, ':', snpinfo_ncols) +#snpinfo_ncols = len(snpinfo_df.columns) +#snpinfo.shape[0] = len(snpinfo_df) +print('No. of rows in', infile0, ':', snpinfo_df.shape[0] + , '\nNo. of cols in', infile0, ':', snpinfo_df.shape[1]) afor_df = pd.read_csv(infile1, sep = ',') -afor_ncols = len(afor_df.columns) -afor_nrows = len(afor_df) -print('No. of rows in', infile1, ':', afor_nrows - , '\nNo. of cols in', infile1, ':', afor_ncols) +#afor_ncols = len(afor_df.columns) +#afor.shape[0] = len(afor_df) +print('No. of rows in', infile1, ':', afor_df.shape[0] + , '\nNo. of cols in', infile1, ':', afor_df.shape[1]) afor_kin_df = pd.read_csv(infile2, sep = ',') -afor_kin_nrows = len(afor_kin_df) -afor_kin_ncols = len(afor_kin_df.columns) -print('No. of rows in', infile2, ':', afor_kin_nrows - , '\nNo. of cols in', infile2, ':', afor_kin_ncols) +#afor_kin.shape[0] = len(afor_kin_df) +#afor_kin_ncols = len(afor_kin_df.columns) +print('No. of rows in', infile2, ':', afor_kin_df.shape[0] + , '\nNo. of cols in', infile2, ':', afor_kin_df.shape[1]) #%% Process afor_df #1) pull all snp_info so you have ref_allele, etc @@ -115,19 +115,14 @@ print('No. of rows in', infile2, ':', afor_kin_nrows # find merging column left_df = afor_df.copy() -left_df_nrows = len(left_df) -left_df_ncols = len(left_df.columns) - right_df = snpinfo_df.copy() -right_df_nrows = len(right_df) -right_df_ncols = len(right_df.columns) common_cols = np.intersect1d(left_df.columns, right_df.columns).tolist() print('Length of common cols:', len(common_cols) , '\ncommon column/s:', common_cols, 'type:', type(common_cols)) -print('selecting consistent dtypes for merging (object i.e string)') #https://stackoverflow.com/questions/44639772/python-pandas-column-dtype-object-causing-merge-to-fail-with-dtypewarning-colu +print('selecting consistent dtypes for merging (object i.e string)') merging_cols = left_df[common_cols].select_dtypes(include = [object]).columns.tolist() print(merging_cols) nmerging_cols = len(merging_cols) @@ -138,24 +133,19 @@ print(' length of merging cols:', nmerging_cols print('Checking for duplicates in common col:', common_cols , '\nNo of duplicates:' , len(right_df[right_df.duplicated(common_cols)]) - , '\noriginal length:', right_df_nrows) + , '\noriginal length:', right_df.shape[0]) right_df = right_df[~right_df.duplicated(common_cols)] -right_df_nrows = len(right_df) -print('\nrevised length:', right_df_nrows) +print('\nrevised length:', right_df.shape[0]) # checking cross-over of mutations in the two dfs to merge -ndiff1 = afor_nrows - afor_df['mutation'].isin(snpinfo_df['mutation']).sum() +ndiff1 = left_df.shape[0] - left_df['mutation'].isin(right_df['mutation']).sum() print('There are', ndiff1, 'mutations with OR, but no snp_info' , '\nExtracting and writing out file') - -#afor_df[afor_df['mutation'].isin(snpinfo_df['mutation'])] -missing_mutinfo = afor_df[~afor_df['mutation'].isin(snpinfo_df['mutation'])] -#len(missing_mutinfo.duplicated(common_cols)) - +missing_mutinfo = left_df[~left_df['mutation'].isin(right_df['mutation'])] #missing_mutinfo.to_csv('infoless_muts.csv') -ndiff2 = snpinfo_nrows - snpinfo_df['mutation'].isin(afor_df['mutation']).sum() +ndiff2 = right_df.shape[0] - right_df['mutation'].isin(left_df['mutation']).sum() print('There are', ndiff2, 'mutations that do not have OR, but have snp_info') # Define join type @@ -166,20 +156,19 @@ my_join = 'left' print('combing with join:', my_join) combined_df1 = pd.merge(left_df, right_df, on = merging_cols, how = my_join) -print('nrows:', len(combined_df1) - , '\nshape:', combined_df1.shape) +print('\nshape:', combined_df1.shape) # inner = 252 -left_df_nrows - ndiff1 +left_df.shape[0] - ndiff1 # outer = 331 -right_df_nrows + ndiff1 +right_df.shape[0] + ndiff1 # right = 290 -right_df_nrows +right_df.shape[0] # left = 293 -left_df_nrows +left_df.shape[0] #%% @@ -195,19 +184,19 @@ print('combing with:', my_join) combined_df1 = pd.merge(left_df, right_df, on = merging_cols, how = my_join) if my_join == 'inner': - #expected_rows = left_df_nrows - ndiff1 + #expected_rows = left_df.shape[0] - ndiff1 expected_rows = left_df.shape[0] - ndiff1 if my_join == 'outer': - #expected_rows = right_df_nrows + ndiff1 + #expected_rows = right_df.shape[0] + ndiff1 expected_rows = right_df.shape[0] + ndiff1 if my_join == 'right': - #expected_rows = right_df_nrows + #expected_rows = right_df.shape[0] expected_rows = right_df.shape[0] if my_join == 'left': - #expected_rows = left_df_nrows + #expected_rows = left_df.shape[0] expected_rows = left_df.shape[0] expected_cols = left_df.shape[1] + right_df.shape[1] - nmerging_cols @@ -224,9 +213,186 @@ print('\nExpected no. of rows:', expected_rows if fail: sys.exit() -# update nrows and ncols -afor_info_nrows = len(afor_info_df) -afor_info_ncols = len(afor_info_df.columns) -#%% +# delete variables +del(left_df, right_df, common_cols, merging_cols, nmerging_cols, my_join, ndiff1, ndiff2, missing_mutinfo + , expected_rows, expected_cols, fail) +del(afor_df, snpinfo_df) + + + + + + +#%% Second merge: combined_df1 and afor_kin_df + +left_df = combined_df1.copy() +right_df = afor_kin_df.copy() + +common_cols = np.intersect1d(left_df.columns, right_df.columns).tolist() +print('Length of common cols:', len(common_cols) + , '\ncommon column/s:', common_cols, 'type:', type(common_cols)) + +#https://stackoverflow.com/questions/44639772/python-pandas-column-dtype-object-causing-merge-to-fail-with-dtypewarning-colu +print('selecting consistent dtypes for merging (object i.e string)') + +#FIXME + +#merging_cols = left_df[common_cols].select_dtypes(include = [object]).columns.tolist() +merging_cols = ['wild_type', 'mutant_type', 'mutationinformation'] +nmerging_cols_cols = len(merging_cols) + +print(merging_cols) +nmerging_cols = len(merging_cols) +print(' length of merging cols:', nmerging_cols + , '\nmerging cols:', merging_cols, 'type:', type(merging_cols)) + +ndiff1 = left_df.shape[0] - left_df['mutationinformation'].isin(right_df['mutationinformation']).sum() +print('There are', ndiff1, 'mutations with OR, but not in OR kinship' + , '\nExtracting and writing out file') +missing_mutinfo = left_df[~left_df['mutationinformation'].isin(right_df['mutationinformation'])] +#missing_mutinfo.to_csv('infoless_muts.csv') + +ndiff2 = right_df.shape[0] - right_df['mutationinformation'].isin(left_df['mutationinformation']).sum() +print('There are', ndiff2, 'mutations that do not have OR, but have OR kinship') + +my_join = 'outer' + +fail = False +print('combing with:', my_join) +combined_df2 = pd.merge(left_df, right_df, on = merging_cols, how = my_join) + +if my_join == 'inner': + #expected_rows = left_df.shape[0] - ndiff1 + expected_rows = left_df.shape[0] - ndiff1 + +if my_join == 'outer': + #expected_rows = right_df.shape[0] + ndiff1 + expected_rows = right_df.shape[0] + ndiff1 + +if my_join == 'right': + #expected_rows = right_df.shape[0] + expected_rows = right_df.shape[0] + +if my_join == 'left': + #expected_rows = left_df.shape[0] + expected_rows = left_df.shape[0] + +expected_cols = left_df.shape[1] + right_df.shape[1] - nmerging_cols + +if len(combined_df2) == expected_rows and len(combined_df2.columns) == expected_cols: + print('PASS: successfully combined dfs with:', my_join, 'join') +else: + print('FAIL: combined_df\'s expected rows and cols not matched') + fail = True +print('\nExpected no. of rows:', expected_rows + , '\nGot:', len(combined_df2) + , '\nExpected no. of cols:', expected_cols + , '\nGot:', len(combined_df2.columns)) +if fail: + sys.exit() +#%% check duplicate cols: ones containing suffix '_x' or '_y' +# should only be position +foo = combined_df2.filter(regex = r'.*_x|_y', axis = 1) +print(foo.columns) # should only be position + +# drop position col containing suffix '_y' and then rename col without suffix +combined_or_df = combined_df2.drop(combined_df2.filter(regex = r'.*_y').columns, axis = 1) +#combined_or_df['position_x'].head() + +# renaming columns +#combined_or_df.rename(columns = {'position_x': 'position'}, inplace = True) +#combined_or_df['position'].head() +#recheck +#foo = combined_or_df.filter(regex = r'.*_x|_y', axis = 1) +#print(foo.columns) # should only be empty + + +# remove '_x' from some cols + +import re +def clean_colnames(colname): + + if re.search('.*_x', colname): + pos = re.search('.*_x', colname).start() + return colname[:pos] + else: + return colname + +#https://stackoverflow.com/questions/26500156/renaming-column-in-dataframe-for-pandas-using-regular-expression +combined_or_df.columns +combined_or_df.rename(columns=lambda x: re.sub('_x$','',x), inplace = True) +combined_or_df.columns + +#FIXME: this should be 0 when you run the 35k dataset +combined_or_df['chromosome_number'].isna().sum() + +#%% rearraging columns +print('Dim of df prefromatting:', combined_or_df.shape) + +print(combined_or_df.columns, '\nshape:', combined_or_df.shape) + +# removing unnecessary column +combined_or_df = combined_or_df.drop(['symbol'], axis = 1) +print(combined_or_df.columns, '\nshape:', combined_or_df.shape) +#%% reorder columns +#https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns +# setting column's order +output_df = combined_or_df[['mutation', + 'mutationinformation', + 'wild_type', + 'position', + 'mutant_type', + 'chr_num_allele', + 'ref_allele', + 'alt_allele', + 'mut_info', + 'mut_type', + 'gene_id', + 'gene_number', + 'mut_region', + 'reference_allele', + 'alternate_allele', + 'chromosome_number', + 'af', + 'af_kin', + 'or_kin', + 'or_logistic', + 'or_mychisq', + 'est_chisq', + 'or_fisher', + 'ci_low_logistic', + 'ci_hi_logistic', + 'ci_low_fisher', + 'ci_hi_fisher', + 'pwald_kin', + 'pval_logistic', + 'pval_fisher', + 'pval_chisq', + 'beta_logistic', + 'beta_kin', + 'se_logistic', + 'se_kin', + 'zval_logistic', + 'logl_H1_kin', + 'l_remle_kin', + 'wt_3let', + 'mt_3let', + 'n_diff', + 'tot_diff', + 'n_miss']] + +# sanity check after rearranging +if combined_or_df.shape == output_df.shape and set(combined_or_df.columns) == set(output_df.columns): + print('PASS: Successfully formatted df with rearranged columns') +else: + sys.exit('FAIL: something went wrong when rearranging columns!') + +#%% write file +print('\n=====================================================================' + , '\nWriting output file:\n', outfile + , '\nNo.of rows:', len(output_df) + , '\nNo. of cols:', len(output_df.columns)) +output_df.to_csv(outfile, index = False) +