various changes

2020-09-08 17:13:02 +01:00 · 2020-09-08 17:13:02 +01:00 · fe49a45447
commit fe49a45447
parent 5d9561f88a
3 changed files with 199 additions and 95 deletions
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@ -55,6 +55,7 @@ os.getcwd()
 #from combining import combine_dfs_with_checks
 from combining_FIXME import detect_common_cols
 from reference_dict import oneletter_aa_dict # CHECK DIR STRUC THERE!
+from reference_dict import low_3letter_dict # CHECK DIR STRUC THERE!
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
@ -79,6 +80,21 @@ gene    = args.gene
 datadir = args.datadir
 indir   = args.input_dir
 outdir  = args.output_dir
+
+gene_match = gene + '_p.'
+print('mut pattern for gene', gene, ':',  gene_match)
+
+nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
+print('nsSNP for gene', gene, ':',  nssnp_match)
+
+wt_regex = gene_match.lower()+'([A-Za-z]{3})'
+print('wt regex:', wt_regex)
+
+mut_regex = r'[0-9]+(\w{3})$'
+print('mt regex:', mut_regex)
+
+pos_regex = r'([0-9]+)'
+print('position regex:', pos_regex)
 #%%=======================================================================
 #==============
 # directories
@ -214,7 +230,9 @@ combined_df[merging_cols_m4].apply(len) == len(combined_df)

 # OR merges: TEDIOUSSSS!!!!

-
+del(mcsm_df, foldx_df, mcsm_foldx_dfs, dssp_kd_dfs, dssp_kd_rd_dfs,rd_df, kd_df, infile_mcsm, infile_foldx, infile_dssp, infile_kd)
+del(merging_cols_m1, merging_cols_m2, merging_cols_m3, merging_cols_m4)
+del(in_filename_dssp, in_filename_foldx, in_filename_kd, in_filename_mcsm, in_filename_rd)
 #%%
 print('==================================='
      , '\nFifth merge: afor_df + afor_kin_df'
@ -235,7 +253,7 @@ print('Dim of afor_df:', afor_df.shape
 # finding if ALL afor_kin_df muts are present in afor_df
 # i.e all kinship muts should be PRESENT in mycalcs_present
 if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]:
-    print('PASS: ALL or_kinship muts are present in my or list')
+    print('PASS: ALL', len(afor_kin_df), 'or_kinship muts are present in my or list')
 else:
    nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])])
    nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]    
@ -246,10 +264,10 @@ else:

 # Now checking how many afor_df muts are NOT present in afor_kin_df    
 common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])])  
-extra_muts_myor =   afor_kin_df.shape[0] - common_muts
+extra_muts_myor = afor_kin_df.shape[0] - common_muts

 print('=========================================='
-      , '\nmy or calcs has', extra_muts_myor, 'extra mutations'
+      , '\nmy or calcs has', common_muts, 'present in af_or_kin_df'
      , '\n==========================================')

 print('Expected cals for merging with outer_join...')
@ -257,10 +275,15 @@ print('Expected cals for merging with outer_join...')
 expected_rows = afor_df.shape[0] + extra_muts_myor
 expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5)

+
+afor_df['mutation']
+afor_kin_df['mutation']
+
 ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join)

 if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols:
-    print('PASS: OR dfs successfully combined! PHEWWWW!')
+    print('PASS but with duplicate muts: OR dfs successfully combined! PHEWWWW!'
+          , '\nDuplicate muts present but with different \'ref\' and \'alt\' alleles')
 else:
    print('FAIL: could not combine OR dfs'
          , '\nCheck expected rows and cols calculation and join type')
@ -269,12 +292,12 @@ print('Dim of merged ors_df:', ors_df.shape)

 ors_df[merging_cols_m5].apply(len)
 ors_df[merging_cols_m5].apply(len) == len(ors_df)
+
 #%%============================================================================
 # formatting ors_df
 ors_df.columns

 # Dropping unncessary columns: already removed in ealier preprocessing
-#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
 cols_to_drop = ['n_miss']
 print('Dropping', len(cols_to_drop), 'columns:\n'
      , cols_to_drop)
@ -327,7 +350,6 @@ column_order = ['mutation'
                #, 'wt_3let' # old
                #, 'mt_3let' # old
                #, 'symbol'
-                #, 'n_miss'
                ]

 if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())):
@ -343,6 +365,61 @@ else:

 print('\nResult of Sixth merge:', ors_df_ordered.shape
      , '\n===================================================================')
+#%%
+ors_df_ordered.shape
+check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']]
+
+# populating 'nan' info
+lookup_dict = dict()
+for k, v in low_3letter_dict.items():
+    lookup_dict[k] = v['one_letter_code']
+    #print(lookup_dict)
+    
+    wt = ors_df_ordered['mutation'].str.extract(wt_regex).squeeze()
+    #print(wt)
+    ors_df_ordered['wild_type'] = wt.map(lookup_dict)
+    
+    ors_df_ordered['position'] = ors_df_ordered['mutation'].str.extract(pos_regex) 
+    
+    mt = ors_df_ordered['mutation'].str.extract(mut_regex).squeeze()
+    ors_df_ordered['mutant_type'] = mt.map(lookup_dict)
+
+ors_df_ordered['mutationinformation'] = ors_df_ordered['wild_type'] + ors_df_ordered.position.map(str) + ors_df_ordered['mutant_type']
+check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']]
+
+# populate mut_info_f1
+ors_df_ordered['mut_info_f1'].isna().sum()
+ors_df_ordered['mut_info_f1'] = ors_df_ordered['position'].astype(str) + ors_df_ordered['wild_type'] + '>' + ors_df_ordered['position'].astype(str) + ors_df_ordered['mutant_type']
+ors_df_ordered['mut_info_f1'].isna().sum()  
+
+# populate mut_info_f2
+ors_df_ordered['mut_info_f2'] = ors_df_ordered['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
+
+# populate mut_type
+ors_df_ordered['mut_type'].isna().sum()
+#mut_type_word = ors_df_ordered['mut_type'].value_counts()
+mut_type_word  = 'missense' # FIXME, should be derived
+ors_df_ordered['mut_type'].fillna(mut_type_word, inplace = True)
+ors_df_ordered['mut_type'].isna().sum()
+
+# populate gene_id
+ors_df_ordered['gene_id'].isna().sum()
+#gene_id_word = ors_df_ordered['gene_id'].value_counts()
+gene_id_word  = 'Rv2043c' # FIXME, should be derived
+ors_df_ordered['gene_id'].fillna(gene_id_word, inplace = True)
+ors_df_ordered['gene_id'].isna().sum()
+
+# populate gene_name
+ors_df_ordered['gene_name'].isna().sum()
+ors_df_ordered['gene_name'].value_counts()
+ors_df_ordered['gene_name'].fillna(gene, inplace = True)
+ors_df_ordered['gene_name'].isna().sum()
+
+# check numbers
+ors_df_ordered['or_kin'].isna().sum()
+# should be 0
+ors_df_ordered['or_mychisq'].isna().sum()
+
 #%%============================================================================ 
 print('==================================='
      , '\nSixth merge: Fourth + Fifth merge'
@ -351,78 +428,93 @@ print('==================================='

 #combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join)
 merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered) 
+
+# dtype problems
+if len(merging_cols_m6) > 1 and 'position'in merging_cols_m6:
+    print('Removing \'position\' from merging_cols_m6 to make dtypes consistent'
+          , '\norig length of merging_cols_m6:', len(merging_cols_m6))
+    merging_cols_m6.remove('position')
+    print('\nlength after removing:', len(merging_cols_m6))
+
 print('Dim of df1:', combined_df.shape
      , '\nDim of df2:', ors_df_ordered.shape
      , '\nNo. of merging_cols:', len(merging_cols_m6))

 print('Checking mutations in the two dfs:'
-      , '\nmuts in df1 but NOT in df2:'
+      , '\nmuts in df1 present in df2:'
      , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum()
-      , '\nmuts in df2 but NOT in df1:'
+      , '\nmuts in df2 present in df1:'
      , ors_df_ordered['mutationinformation'].isin(combined_df['mutationinformation']).sum())

-#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) )
-
-combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join)
+#----------
+# merge 6
+#----------
+combined_df_all = pd.merge(combined_df, ors_df_ordered, on = merging_cols_m6, how = o_join)
 combined_df_all.shape

-# populate mut_info_f1
-combined_df_all['mut_info_f1'].isna().sum()
-combined_df_all['mut_info_f1'] = combined_df_all['position'].astype(str) + combined_df_all['wild_type'] + '>' + combined_df_all['position'].astype(str) + combined_df_all['mutant_type']
-combined_df_all['mut_info_f1'].isna().sum()  
-
-# populate mut_type
-combined_df_all['mut_type'].isna().sum()
-#mut_type_word = combined_df_all['mut_type'].value_counts()
-mut_type_word  = 'missense' # FIXME, should be derived
-combined_df_all['mut_type'].fillna(mut_type_word, inplace = True)
-combined_df_all['mut_type'].isna().sum()
-
-# populate gene_id
-combined_df_all['gene_id'].isna().sum()
-#gene_id_word = combined_df_all['gene_id'].value_counts()
-gene_id_word  = 'Rv2043c' # FIXME, should be derived
-combined_df_all['gene_id'].fillna(gene_id_word, inplace = True)
-combined_df_all['gene_id'].isna().sum()
-
-# populate gene_name
-combined_df_all['gene_name'].isna().sum()
-combined_df_all['gene_name'].value_counts()
-combined_df_all['gene_name'].fillna(gene, inplace = True)
-combined_df_all['gene_name'].isna().sum()
-
-
-# FIXME: DIM
-# only with left join!
-outdf_expected_rows = len(combined_df)
+# sanity check for merge 6
+outdf_expected_rows = len(combined_df) + extra_muts_myor
+unique_muts = len(combined_df)
 outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6)

-#if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all.shape[0] == outdf_expected_rows:  
-if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == outdf_expected_rows:
+if combined_df_all.shape[0] == outdf_expected_rows and combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == unique_muts:
    print('PASS: Df dimension match'
-          , '\nDim of combined_df_all with join type:', l_join
+          , '\ncombined_df_all with join type:', o_join
          , '\n', combined_df_all.shape
          , '\n===============================================================')
 else:
    print('FAIL: Df dimension mismatch'
             , 'Cannot generate expected dim. See details of merge performed'
             , '\ndf1 dim:', combined_df.shape
-             , '\ndf2 dim:', ors_df.shape
+             , '\ndf2 dim:', ors_df_ordered.shape
             , '\nGot:', combined_df_all.shape
             , '\nmuts in df1 but NOT in df2:'
-             , combined_df['mutationinformation'].isin(ors_df['mutationinformation']).sum()
+             , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum()
             , '\nmuts in df2 but NOT in df1:'
             , ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum())
    sys.exit()
 
+# drop extra cols
+all_cols = combined_df_all.columns
+
+#pos_cols_check = combined_df_all[['position_x','position_y']]
+c = combined_df_all[['position_x','position_y']].isna().sum()
+pos_col_to_drop = c.index[c>0].to_list()
+cols_to_drop = pos_col_to_drop +  ['wild_type_kd']   
+
+print('Dropping', len(cols_to_drop), 'columns:\n', cols_to_drop)
+combined_df_all.drop(cols_to_drop, axis = 1, inplace = True) 
+ 
+# rename position_x to position
+pos_col_to_rename = c.index[c==0].to_list()
+combined_df_all.shape
+combined_df_all.rename(columns = { pos_col_to_rename[0]: 'position'}, inplace = True)
+combined_df_all.shape
+
+all_cols = combined_df_all.columns
+
+#%% reorder cols to for convenience
+first_cols = ['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']
+last_cols = [col for col in combined_df_all.columns if col not in first_cols]
+
+df = combined_df_all[first_cols+last_cols]

 #%% IMPORTANT: check if mutation related info is all populated after this merge  
-# FIXME: should get fixed with JP's resolved dataset!?
-check_nan = combined_df_all.isna().sum(axis = 0)
+# select string colnames to ensure no NA exist there
+string_cols = combined_df_all.columns[combined_df_all.applymap(lambda x: isinstance(x, str)).all(0)]
+
+if (combined_df_all[string_cols].isna().sum(axis = 0)== 0).all():
+    print('PASS: All string cols are populated with no NAs')
+else:
+    print('FAIL: NAs detected in string cols')
+    print(combined_df_all[string_cols].isna().sum(axis = 0))
+    sys.exit()
+
 # relevant mut cols
 check_mut_cols = merging_cols_m5 + merging_cols_m6

 count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'})
+print(check_mut_cols)

 if (count_na_mut_cols['na_count'].sum() > 0).any():
    # FIXME: static override, generate 'mutation' from variable
@ -434,31 +526,29 @@ else:
    print('No missing \'mutation\' has been detected!')
    
    
-lookup_dict = dict()
-for k, v in oneletter_aa_dict.items():
-    lookup_dict[k] = v['three_letter_code_lower']
-    print(lookup_dict)
-    wt_3let = combined_df_all['wild_type'].map(lookup_dict)
-    #print(wt_3let)
-    pos = combined_df_all['position'].astype(str)
-    #print(pos)
-    mt_3let = combined_df_all['mutant_type'].map(lookup_dict)
-    #print(mt_3let)
-    # override the 'mutation' column
-    combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let
-    print(combined_df_all['mutation'])    

-# populate mut_info_f2
-combined_df_all['mut_info_f2'] = combined_df_all['mutation'].str.replace(gene_match.lower(), 'p.', regex = True)
+#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
+col_to_drop = ['wild_type_kd']
+print('Dropping', len(col_to_drop), 'columns:\n'
+      , col_to_drop)
+combined_df_all.drop(col_to_drop, axis = 1, inplace = True)
+
+

 #%% check
 #cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2']
 #foo = combined_df_all[cols_check]
-
+foo = combined_df_all.drop_duplicates('mutationinformation')
+foo2 = combined_df_all.drop_duplicates('mutation')
+poo = combined_df_all[combined_df_all['mutation'].isna()]
 #%%============================================================================
 output_cols = combined_df_all.columns
 #print('Output cols:', output_cols)
-
+#%% drop duplicates
+if combined_df_all.shape[0] != outdf_expected_rows:
+    print('INFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele')
+else:
+    print('combined_df_all has no duplicate muts present')
 #%%============================================================================ 
 # write csv
 print('Writing file: combined output of all params needed for plotting and ML')
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@ -81,16 +81,16 @@ gene = args.gene
 gene_match = gene + '_p.'
 print('mut pattern for gene', gene, ':',  gene_match)

-nssnp_match = gene_match +'[A-Z]{3}[0-9]+[A-Z]{3}'
+nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
 print('nsSNP for gene', gene, ':',  nssnp_match)

-wt_regex = gene_match.lower()+'(\w{3})'
+wt_regex = gene_match.lower()+'([A-Za-z]{3})'
 print('wt regex:', wt_regex)

-mut_regex = r'\d+(\w{3})$'
+mut_regex = r'[0-9]+(\w{3})$'
 print('mt regex:', mut_regex)

-pos_regex = r'(\d+)'
+pos_regex = r'([0-9]+)'
 print('position regex:', pos_regex)

 # building cols to extract
@ -154,30 +154,29 @@ if in_filename_master == 'original_tanushree_data_v2.csv':
 else:
    core_cols = ['id'
                 , 'sample'
-                 , 'patient_id'
-                 , 'strain'
+                 #, 'patient_id'
+                 #, 'strain'
                 , 'lineage'
                 , 'sublineage' 
-                 , 'country'
+                 #, 'country'
                 , 'country_code'
                 , 'geographic_source'
                 #, 'region'
-                 , 'location'
-                 , 'host_body_site'
-                 , 'environment_material'
-                 , 'host_status'
-                 , 'host_sex'
-                 , 'submitted_host_sex'
-                 , 'hiv_status'
-                 , 'HIV_status'
-                 , 'tissue_type'
-                 , 'isolation_source'
+                 #, 'location'
+                 #, 'host_body_site'
+                 #, 'environment_material'
+                 #, 'host_status'
+                 #, 'host_sex'
+                 #, 'submitted_host_sex'
+                 #, 'hiv_status'
+                 #, 'HIV_status'
+                 #, 'tissue_type'
+                 #, 'isolation_source'
                 ,  resistance_col]
    
    variable_based_cols = [drug
                           , dr_muts_col
                           , other_muts_col]
-                           #, resistance_col]
       
    cols_to_extract = core_cols + variable_based_cols
    print('Extracting', len(cols_to_extract), 'columns from master data')
@ -200,7 +199,7 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum()

 #%% Write check file
 check_file = outdir + '/' + gene.lower() + '_gwas.csv'
-meta_data.to_csv(check_file)
+meta_data.to_csv(check_file, index = False)
 print('Writing subsetted gwas data'
      , '\nFile', check_file
      , '\nDim:', meta_data.shape)
@ -215,9 +214,9 @@ print('Writing subsetted gwas data'
 # drug counts: complete samples for OR calcs
 meta_data[drug].value_counts() 
 print('===========================================================\n'
-      , 'RESULT: No. of Sus and Res samples:\n', meta_data[drug].value_counts()
+      , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
 	  , '\n===========================================================\n'
-      , 'RESULT: Percentage of Sus and Res samples:\n', meta_data[drug].value_counts(normalize = True)*100
+      , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
 		, '\n===========================================================')

 #%%
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@ -58,6 +58,20 @@ rm(my_df, upos, dup_muts)
 #in_file1: output of plotting_data.R
 # my_df_u

+# quick checks
+head(my_df_u[, c("mutation", "mutation2")])
+
+cols_to_extract  = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin")
+foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract]
+
+
+which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af))
+
+
+baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq)
+colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or")
+
+
 # infile 2: gene associated meta data
 #in_filename_gene_metadata = paste0(tolower(gene),  "_meta_data_with_AFandOR.csv")
 in_filename_gene_metadata = paste0(tolower(gene),  "_metadata.csv")
@ -94,6 +108,7 @@ gene_metadata <- read.csv(infile_gene_metadata
                      , header = T)
 cat("Dim:", dim(gene_metadata))

+
 # counting NAs in AF, OR cols: 
 if (identical(sum(is.na(my_df_u$or_mychisq))
              , sum(is.na(my_df_u$pval_fisher))
@ -230,9 +245,9 @@ if (identical(sum(is.na(merged_df3$or_kin))
 if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_kin)))
  && identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin)))
  && identical( which(is.na(merged_df2$pval_fisher)), which(is.na(merged_df2$pwald_kin))) ){
-  cat('PASS: Indices match for mychisq and kin ors missing values')
+  cat("PASS: Indices match for mychisq and kin ors missing values")
 } else{
-  cat('Index mismatch: mychisq and kin ors missing indices match')
+  cat("Index mismatch: mychisq and kin ors missing indices match")
  quit()
 }

@ -245,7 +260,7 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"
    ,"\nfilename: merged_df2_comp")

 if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
-  print('mychisq and kin ors missing indices match. Procedding with omitting NAs')
+  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
  na_count_df2 = sum(is.na(merged_df2$af))
  merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
  # sanity check: no +-1 gymnastics
@ -262,7 +277,7 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
        ,"\nGot no. of rows: ", nrow(merged_df2_comp))
  }
 }else{
-  print('Index mismatch for mychisq and kin ors. Aborting NA ommission')
+  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
 }

 #=========================
@ -272,7 +287,7 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){
 #=========================

 if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){
-  print('mychisq and kin ors missing indices match. Procedding with omitting NAs')
+  print("mychisq and kin ors missing indices match. Procedding with omitting NAs")
  na_count_df3 = sum(is.na(merged_df3$af))
  #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
  merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
@ -289,7 +304,7 @@ if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){
        ,"\nGot no. of rows: ", nrow(merged_df3_comp))
   }
 } else{
-  print('Index mismatch for mychisq and kin ors. Aborting NA ommission')
+  print("Index mismatch for mychisq and kin ors. Aborting NA ommission")
 } 
  
 # alternate way of deriving merged_df3_comp
@ -347,7 +362,7 @@ merged_df3_comp_lig = merged_df3_comp[merged_df3_comp$ligand_distance<10,]
 if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){
  print("PASS: verified merged_df3_lig")
 }else{
-  cat(paste0('FAIL: nrow mismatch for merged_df3_lig'
+  cat(paste0("FAIL: nrow mismatch for merged_df3_lig"
             , "\nExpected:", nrow(my_df_u_lig)
             , "\nGot:", nrow(merged_df3_lig)))
 }