From e4608342a4324cade4eb6ef78dd9fc25141f7e68 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 8 Sep 2020 17:13:02 +0100
Subject: [PATCH] various changes

---
 scripts/combining_dfs.py                  | 224 +++++++++++++++-------
 scripts/data_extraction.py                |  41 ++--
 scripts/plotting/combining_dfs_plotting.R |  29 ++-
 3 files changed, 199 insertions(+), 95 deletions(-)

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index 622e8e5..be62177 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -55,6 +55,7 @@ os.getcwd()
 #from combining import combine_dfs_with_checks
 from combining_FIXME import detect_common_cols
 from reference_dict import oneletter_aa_dict # CHECK DIR STRUC THERE!
+from reference_dict import low_3letter_dict # CHECK DIR STRUC THERE!
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
@@ -79,6 +80,21 @@ gene    = args.gene
 datadir = args.datadir
 indir   = args.input_dir
 outdir  = args.output_dir
+
+gene_match = gene + '_p.'
+print('mut pattern for gene', gene, ':',  gene_match)
+
+nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
+print('nsSNP for gene', gene, ':',  nssnp_match)
+
+wt_regex = gene_match.lower()+'([A-Za-z]{3})'
+print('wt regex:', wt_regex)
+
+mut_regex = r'[0-9]+(\w{3})$'
+print('mt regex:', mut_regex)
+
+pos_regex = r'([0-9]+)'
+print('position regex:', pos_regex)
 #%%=======================================================================
 #==============
 # directories
@@ -214,7 +230,9 @@ combined_df[merging_cols_m4].apply(len) == len(combined_df)
 
 # OR merges: TEDIOUSSSS!!!!
 
-
+del(mcsm_df, foldx_df, mcsm_foldx_dfs, dssp_kd_dfs, dssp_kd_rd_dfs,rd_df, kd_df, infile_mcsm, infile_foldx, infile_dssp, infile_kd)
+del(merging_cols_m1, merging_cols_m2, merging_cols_m3, merging_cols_m4)
+del(in_filename_dssp, in_filename_foldx, in_filename_kd, in_filename_mcsm, in_filename_rd)
 #%%
 print('==================================='
       , '\nFifth merge: afor_df + afor_kin_df'
@@ -235,7 +253,7 @@ print('Dim of afor_df:', afor_df.shape
 # finding if ALL afor_kin_df muts are present in afor_df
 # i.e all kinship muts should be PRESENT in mycalcs_present
 if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]:
-    print('PASS: ALL or_kinship muts are present in my or list')
+    print('PASS: ALL', len(afor_kin_df), 'or_kinship muts are present in my or list')
 else:
     nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])])
     nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]    
@@ -246,10 +264,10 @@ else:
 
 # Now checking how many afor_df muts are NOT present in afor_kin_df    
 common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])])  
-extra_muts_myor =   afor_kin_df.shape[0] - common_muts
+extra_muts_myor = afor_kin_df.shape[0] - common_muts
 
 print('=========================================='
-      , '\nmy or calcs has', extra_muts_myor, 'extra mutations'
+      , '\nmy or calcs has', common_muts, 'present in af_or_kin_df'
       , '\n==========================================')
 
 print('Expected cals for merging with outer_join...')
@@ -257,10 +275,15 @@ print('Expected cals for merging with outer_join...')
 expected_rows = afor_df.shape[0] + extra_muts_myor
 expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5)
 
+
+afor_df['mutation']
+afor_kin_df['mutation']
+
 ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join)
 
 if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols:
-    print('PASS: OR dfs successfully combined! PHEWWWW!')
+    print('PASS but with duplicate muts: OR dfs successfully combined! PHEWWWW!'
+          , '\nDuplicate muts present but with different \'ref\' and \'alt\' alleles')
 else:
     print('FAIL: could not combine OR dfs'
           , '\nCheck expected rows and cols calculation and join type')
@@ -269,12 +292,12 @@ print('Dim of merged ors_df:', ors_df.shape)
 
 ors_df[merging_cols_m5].apply(len)
 ors_df[merging_cols_m5].apply(len) == len(ors_df)
+
 #%%============================================================================
 # formatting ors_df
 ors_df.columns
 
 # Dropping unncessary columns: already removed in ealier preprocessing
-#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
 cols_to_drop = ['n_miss']
 print('Dropping', len(cols_to_drop), 'columns:\n'
       , cols_to_drop)
@@ -327,7 +350,6 @@ column_order = ['mutation'
                 #, 'wt_3let' # old
                 #, 'mt_3let' # old
                 #, 'symbol'
-                #, 'n_miss'
                 ]
 
 if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())):
@@ -343,6 +365,61 @@ else:
 
 print('\nResult of Sixth merge:', ors_df_ordered.shape
       , '\n===================================================================')
+#%%
+ors_df_ordered.shape
+check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']]
+
+# populating 'nan' info
+lookup_dict = dict()
+for k, v in low_3letter_dict.items():
+    lookup_dict[k] = v['one_letter_code']
+    #print(lookup_dict)
+    
+    wt = ors_df_ordered['mutation'].str.extract(wt_regex).squeeze()
+    #print(wt)
+    ors_df_ordered['wild_type'] = wt.map(lookup_dict)
+    
+    ors_df_ordered['position'] = ors_df_ordered['mutation'].str.extract(pos_regex) 
+    
+    mt = ors_df_ordered['mutation'].str.extract(mut_regex).squeeze()
+    ors_df_ordered['mutant_type'] = mt.map(lookup_dict)
+
+ors_df_ordered['mutationinformation'] = ors_df_ordered['wild_type'] + ors_df_ordered.position.map(str) + ors_df_ordered['mutant_type']
+check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']]
+
+# populate mut_info_f1
+ors_df_ordered['mut_info_f1'].isna().sum()
+ors_df_ordered['mut_info_f1'] = ors_df_ordered['position'].astype(str) + ors_df_ordered['wild_type'] + '>' + ors_df_ordered['position'].astype(str) + ors_df_ordered['mutant_type']
+ors_df_ordered['mut_info_f1'].isna().sum()  
+
+# populate mut_info_f2
+ors_df_ordered['mut_info_f2'] = ors_df_ordered['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
+
+# populate mut_type
+ors_df_ordered['mut_type'].isna().sum()
+#mut_type_word = ors_df_ordered['mut_type'].value_counts()
+mut_type_word  = 'missense' # FIXME, should be derived
+ors_df_ordered['mut_type'].fillna(mut_type_word, inplace = True)
+ors_df_ordered['mut_type'].isna().sum()
+
+# populate gene_id
+ors_df_ordered['gene_id'].isna().sum()
+#gene_id_word = ors_df_ordered['gene_id'].value_counts()
+gene_id_word  = 'Rv2043c' # FIXME, should be derived
+ors_df_ordered['gene_id'].fillna(gene_id_word, inplace = True)
+ors_df_ordered['gene_id'].isna().sum()
+
+# populate gene_name
+ors_df_ordered['gene_name'].isna().sum()
+ors_df_ordered['gene_name'].value_counts()
+ors_df_ordered['gene_name'].fillna(gene, inplace = True)
+ors_df_ordered['gene_name'].isna().sum()
+
+# check numbers
+ors_df_ordered['or_kin'].isna().sum()
+# should be 0
+ors_df_ordered['or_mychisq'].isna().sum()
+
 #%%============================================================================ 
 print('==================================='
       , '\nSixth merge: Fourth + Fifth merge'
@@ -350,79 +427,94 @@ print('==================================='
       , '\n===================================')
 
 #combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join)
-merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered)
+merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered) 
+
+# dtype problems
+if len(merging_cols_m6) > 1 and 'position'in merging_cols_m6:
+    print('Removing \'position\' from merging_cols_m6 to make dtypes consistent'
+          , '\norig length of merging_cols_m6:', len(merging_cols_m6))
+    merging_cols_m6.remove('position')
+    print('\nlength after removing:', len(merging_cols_m6))
+
 print('Dim of df1:', combined_df.shape
       , '\nDim of df2:', ors_df_ordered.shape
       , '\nNo. of merging_cols:', len(merging_cols_m6))
 
 print('Checking mutations in the two dfs:'
-      , '\nmuts in df1 but NOT in df2:'
+      , '\nmuts in df1 present in df2:'
       , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum()
-      , '\nmuts in df2 but NOT in df1:'
+      , '\nmuts in df2 present in df1:'
       , ors_df_ordered['mutationinformation'].isin(combined_df['mutationinformation']).sum())
 
-#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) )
-
-combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join)
+#----------
+# merge 6
+#----------
+combined_df_all = pd.merge(combined_df, ors_df_ordered, on = merging_cols_m6, how = o_join)
 combined_df_all.shape
 
-# populate mut_info_f1
-combined_df_all['mut_info_f1'].isna().sum()
-combined_df_all['mut_info_f1'] = combined_df_all['position'].astype(str) + combined_df_all['wild_type'] + '>' + combined_df_all['position'].astype(str) + combined_df_all['mutant_type']
-combined_df_all['mut_info_f1'].isna().sum()  
-
-# populate mut_type
-combined_df_all['mut_type'].isna().sum()
-#mut_type_word = combined_df_all['mut_type'].value_counts()
-mut_type_word  = 'missense' # FIXME, should be derived
-combined_df_all['mut_type'].fillna(mut_type_word, inplace = True)
-combined_df_all['mut_type'].isna().sum()
-
-# populate gene_id
-combined_df_all['gene_id'].isna().sum()
-#gene_id_word = combined_df_all['gene_id'].value_counts()
-gene_id_word  = 'Rv2043c' # FIXME, should be derived
-combined_df_all['gene_id'].fillna(gene_id_word, inplace = True)
-combined_df_all['gene_id'].isna().sum()
-
-# populate gene_name
-combined_df_all['gene_name'].isna().sum()
-combined_df_all['gene_name'].value_counts()
-combined_df_all['gene_name'].fillna(gene, inplace = True)
-combined_df_all['gene_name'].isna().sum()
-
-
-# FIXME: DIM
-# only with left join!
-outdf_expected_rows = len(combined_df)
+# sanity check for merge 6
+outdf_expected_rows = len(combined_df) + extra_muts_myor
+unique_muts = len(combined_df)
 outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6)
 
-#if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all.shape[0] == outdf_expected_rows:  
-if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == outdf_expected_rows:
+if combined_df_all.shape[0] == outdf_expected_rows and combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == unique_muts:
     print('PASS: Df dimension match'
-          , '\nDim of combined_df_all with join type:', l_join
+          , '\ncombined_df_all with join type:', o_join
           , '\n', combined_df_all.shape
           , '\n===============================================================')
 else:
     print('FAIL: Df dimension mismatch'
              , 'Cannot generate expected dim. See details of merge performed'
              , '\ndf1 dim:', combined_df.shape
-             , '\ndf2 dim:', ors_df.shape
+             , '\ndf2 dim:', ors_df_ordered.shape
              , '\nGot:', combined_df_all.shape
              , '\nmuts in df1 but NOT in df2:'
-             , combined_df['mutationinformation'].isin(ors_df['mutationinformation']).sum()
+             , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum()
              , '\nmuts in df2 but NOT in df1:'
              , ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum())
     sys.exit()
-    
+ 
+# drop extra cols
+all_cols = combined_df_all.columns
+
+#pos_cols_check = combined_df_all[['position_x','position_y']]
+c = combined_df_all[['position_x','position_y']].isna().sum()
+pos_col_to_drop = c.index[c>0].to_list()
+cols_to_drop = pos_col_to_drop +  ['wild_type_kd']   
+
+print('Dropping', len(cols_to_drop), 'columns:\n', cols_to_drop)
+combined_df_all.drop(cols_to_drop, axis = 1, inplace = True) 
+ 
+# rename position_x to position
+pos_col_to_rename = c.index[c==0].to_list()
+combined_df_all.shape
+combined_df_all.rename(columns = { pos_col_to_rename[0]: 'position'}, inplace = True)
+combined_df_all.shape
+
+all_cols = combined_df_all.columns
+
+#%% reorder cols to for convenience
+first_cols = ['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']
+last_cols = [col for col in combined_df_all.columns if col not in first_cols]
+
+df = combined_df_all[first_cols+last_cols]
 
 #%% IMPORTANT: check if mutation related info is all populated after this merge  
-# FIXME: should get fixed with JP's resolved dataset!?
-check_nan = combined_df_all.isna().sum(axis = 0)
+# select string colnames to ensure no NA exist there
+string_cols = combined_df_all.columns[combined_df_all.applymap(lambda x: isinstance(x, str)).all(0)]
+
+if (combined_df_all[string_cols].isna().sum(axis = 0)== 0).all():
+    print('PASS: All string cols are populated with no NAs')
+else:
+    print('FAIL: NAs detected in string cols')
+    print(combined_df_all[string_cols].isna().sum(axis = 0))
+    sys.exit()
+
 # relevant mut cols
 check_mut_cols = merging_cols_m5 + merging_cols_m6
 
 count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'})
+print(check_mut_cols)
 
 if (count_na_mut_cols['na_count'].sum() > 0).any():
     # FIXME: static override, generate 'mutation' from variable
@@ -434,31 +526,29 @@ else:
     print('No missing \'mutation\' has been detected!')
     
     
-lookup_dict = dict()
-for k, v in oneletter_aa_dict.items():
-    lookup_dict[k] = v['three_letter_code_lower']
-    print(lookup_dict)
-    wt_3let = combined_df_all['wild_type'].map(lookup_dict)
-    #print(wt_3let)
-    pos = combined_df_all['position'].astype(str)
-    #print(pos)
-    mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
-    #print(mt_3let)
-    # override the 'mutation' column
-    combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
-    print(combined_df_all['mutation'])    
 
-# populate mut_info_f2
-combined_df_all['mut_info_f2'] = combined_df_all['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
+#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
+col_to_drop = ['wild_type_kd']
+print('Dropping', len(col_to_drop), 'columns:\n'
+      , col_to_drop)
+combined_df_all.drop(col_to_drop, axis = 1, inplace = True)
+
+
 
 #%% check
 #cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2']
 #foo = combined_df_all[cols_check]
-
+foo = combined_df_all.drop_duplicates('mutationinformation')
+foo2 = combined_df_all.drop_duplicates('mutation')
+poo = combined_df_all[combined_df_all['mutation'].isna()]
 #%%============================================================================
 output_cols = combined_df_all.columns
 #print('Output cols:', output_cols)
-
+#%% drop duplicates
+if combined_df_all.shape[0] != outdf_expected_rows:
+    print('INFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele')
+else:
+    print('combined_df_all has no duplicate muts present')
 #%%============================================================================ 
 # write csv
 print('Writing file: combined output of all params needed for plotting and ML')
diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py
index 3f72b1b..f16e136 100755
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -81,16 +81,16 @@ gene = args.gene
 gene_match = gene + '_p.'
 print('mut pattern for gene', gene, ':',  gene_match)
 
-nssnp_match = gene_match +'[A-Z]{3}[0-9]+[A-Z]{3}'
+nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
 print('nsSNP for gene', gene, ':',  nssnp_match)
 
-wt_regex = gene_match.lower()+'(\w{3})'
+wt_regex = gene_match.lower()+'([A-Za-z]{3})'
 print('wt regex:', wt_regex)
 
-mut_regex = r'\d+(\w{3})$'
+mut_regex = r'[0-9]+(\w{3})$'
 print('mt regex:', mut_regex)
 
-pos_regex = r'(\d+)'
+pos_regex = r'([0-9]+)'
 print('position regex:', pos_regex)
 
 # building cols to extract
@@ -154,30 +154,29 @@ if in_filename_master == 'original_tanushree_data_v2.csv':
 else:
     core_cols = ['id'
                  , 'sample'
-                 , 'patient_id'
-                 , 'strain'
+                 #, 'patient_id'
+                 #, 'strain'
                  , 'lineage'
                  , 'sublineage' 
-                 , 'country'
+                 #, 'country'
                  , 'country_code'
                  , 'geographic_source'
                  #, 'region'
-                 , 'location'
-                 , 'host_body_site'
-                 , 'environment_material'
-                 , 'host_status'
-                 , 'host_sex'
-                 , 'submitted_host_sex'
-                 , 'hiv_status'
-                 , 'HIV_status'
-                 , 'tissue_type'
-                 , 'isolation_source'
+                 #, 'location'
+                 #, 'host_body_site'
+                 #, 'environment_material'
+                 #, 'host_status'
+                 #, 'host_sex'
+                 #, 'submitted_host_sex'
+                 #, 'hiv_status'
+                 #, 'HIV_status'
+                 #, 'tissue_type'
+                 #, 'isolation_source'
                  ,  resistance_col]
     
     variable_based_cols = [drug
                            , dr_muts_col
                            , other_muts_col]
-                           #, resistance_col]
        
     cols_to_extract = core_cols + variable_based_cols
     print('Extracting', len(cols_to_extract), 'columns from master data')
@@ -200,7 +199,7 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum()
 
 #%% Write check file
 check_file = outdir + '/' + gene.lower() + '_gwas.csv'
-meta_data.to_csv(check_file)
+meta_data.to_csv(check_file, index = False)
 print('Writing subsetted gwas data'
       , '\nFile', check_file
       , '\nDim:', meta_data.shape)
@@ -215,9 +214,9 @@ print('Writing subsetted gwas data'
 # drug counts: complete samples for OR calcs
 meta_data[drug].value_counts() 
 print('===========================================================\n'
-      , 'RESULT: No. of Sus and Res samples:\n', meta_data[drug].value_counts()
+      , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
 	  , '\n===========================================================\n'
-      , 'RESULT: Percentage of Sus and Res samples:\n', meta_data[drug].value_counts(normalize = True)*100
+      , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
 		, '\n===========================================================')
 
 #%%
diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R
index 7d45d1b..38474d9 100644
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@@ -58,6 +58,20 @@ rm(my_df, upos, dup_muts)
 #in_file1: output of plotting_data.R
 # my_df_u
 
+# quick checks
+head(my_df_u[, c("mutation", "mutation2")])
+
+cols_to_extract  = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin")
+foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract]
+
+
+which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af))
+
+
+baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq)
+colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or")
+
+
 # infile 2: gene associated meta data
 #in_filename_gene_metadata = paste0(tolower(gene),  "_meta_data_with_AFandOR.csv")
 in_filename_gene_metadata = paste0(tolower(gene),  "_metadata.csv")
@@ -94,6 +108,7 @@ gene_metadata <- read.csv(infile_gene_metadata
                       , header = T)
 cat("Dim:", dim(gene_metadata))
 
+
 # counting NAs in AF, OR cols: 
 if (identical(sum(is.na(my_df_u$or_mychisq))
               , sum(is.na(my_df_u$pval_fisher))
@@ -230,9 +245,9 @@ if (identical(sum(is.na(merged_df3$or_kin))
 if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_kin)))
   && identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin)))
   && identical( which(is.na(merged_df2$pval_fisher)), which(is.na(merged_df2$pwald_kin))) ){
-  cat('PASS: Indices match for mychisq and kin ors missing values')
+  cat("PASS: Indices match for mychisq and kin ors missing values")
 } else{
-  cat('Index mismatch: mychisq and kin ors missing indices match')
+  cat("Index mismatch: mychisq and kin ors missing indices match")
   quit()
 }
 
@@ -245,7 +260,7 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"
     ,"\nfilename: merged_df2_comp")
 
 if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
-  print('mychisq and kin ors missing indices match. Procedding with omitting NAs')
+  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
   na_count_df2 = sum(is.na(merged_df2$af))
   merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
   # sanity check: no +-1 gymnastics
@@ -262,7 +277,7 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
         ,"\nGot no. of rows: ", nrow(merged_df2_comp))
   }
 }else{
-  print('Index mismatch for mychisq and kin ors. Aborting NA ommission')
+  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
 }
 
 #=========================
@@ -272,7 +287,7 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
 #=========================
 
 if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){
-  print('mychisq and kin ors missing indices match. Procedding with omitting NAs')
+  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
   na_count_df3 = sum(is.na(merged_df3$af))
   #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
   merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
@@ -289,7 +304,7 @@ if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){
         ,"\nGot no. of rows: ", nrow(merged_df3_comp))
    }
 } else{
-  print('Index mismatch for mychisq and kin ors. Aborting NA ommission')
+  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
 } 
   
 # alternate way of deriving merged_df3_comp
@@ -347,7 +362,7 @@ merged_df3_comp_lig = merged_df3_comp[merged_df3_comp$ligand_distance<10,]
 if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){
   print("PASS: verified merged_df3_lig")
 }else{
-  cat(paste0('FAIL: nrow mismatch for merged_df3_lig'
+  cat(paste0("FAIL: nrow mismatch for merged_df3_lig"
              , "\nExpected:", nrow(my_df_u_lig)
              , "\nGot:", nrow(merged_df3_lig)))
 }