saving work before adding files

2021-06-18 17:47:09 +01:00 · 2021-06-18 17:47:09 +01:00 · 926d181120
commit 926d181120
parent 0e0f7c89df
3 changed files with 47 additions and 24 deletions
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@ -54,13 +54,26 @@ os.getcwd()
 # FIXME: local imports
 #from combining import combine_dfs_with_checks
 from combining_FIXME import detect_common_cols
-from reference_dict import oneletter_aa_dict # CHECK DIR STRUC THERE!
-from reference_dict import low_3letter_dict # CHECK DIR STRUC THERE!
+from reference_dict import oneletter_aa_dict 
+from reference_dict import low_3letter_dict 
+
+from aa_code import get_aa_3lower
+from aa_code import get_aa_1upper
+
+# REGEX: as required
+# mcsm_regex = r'^([A-Za-z]{1})([0-9]+)([A-Za-z]{1})$'
+# mcsm_wt = mcsm_df['mutationinformation'].str.extract(mcsm_regex)[0]
+# mcsm_mut = mcsm_df['mutationinformation'].str.extract(mcsm_regex)[2]
+
+# gwas_regex = r'^([A-Za-z]{3})([0-9]+)([A-Za-z]{3})$'
+# gwas_wt = mcsm_df['mutation'].str.extract(gwas_regex)[0]
+# gwas_pos = mcsm_df['mutation'].str.extract(gwas_regex)[1]
+# gwas_mut = mcsm_df['mutation'].str.extract(gwas_regex)[2]
 #=======================================================================
 #%% command line args: case sensitive
 arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help='drug name', default = '')
-arg_parser.add_argument('-g', '--gene', help='gene name', default = '') 
+arg_parser.add_argument('-d', '--drug', help = 'drug name', default = '')
+arg_parser.add_argument('-g', '--gene', help = 'gene name', default = '') 

 arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
 arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
@ -83,17 +96,19 @@ outdir  = args.output_dir
 gene_match = gene + '_p.'
 print('mut pattern for gene', gene, ':',  gene_match)

-nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
-print('nsSNP for gene', gene, ':',  nssnp_match)
+# Redundant now that improvements have been made!
+# See section "REGEX"
+# nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
+# print('nsSNP for gene', gene, ':',  nssnp_match)

-wt_regex = gene_match.lower()+'([A-Za-z]{3})'
-print('wt regex:', wt_regex)
+# wt_regex = gene_match.lower()+'([A-Za-z]{3})'
+# print('wt regex:', wt_regex)

-mut_regex = r'[0-9]+(\w{3})$'
-print('mt regex:', mut_regex)
+# mut_regex = r'[0-9]+(\w{3})$'
+# print('mt regex:', mut_regex)

-pos_regex = r'([0-9]+)'
-print('position regex:', pos_regex)
+# pos_regex = r'([0-9]+)'
+# print('position regex:', pos_regex)
 #%%=======================================================================
 #==============
 # directories
@ -168,6 +183,14 @@ print('==================================='
      , '\n===================================')

 mcsm_df =  pd.read_csv(infile_mcsm, sep = ',')
+
+# add 3-letter lowercase aa codes for wt and mutant
+get_aa_3lower(df = mcsm_df
+              , wt_colname = 'wild_type'
+              , mut_colname = 'mutant_type'
+              , col_wt = 'wt_aa_3lower'
+              , col_mut = 'mut_aa_3lower')
+
 #mcsm_df.columns = mcsm_df.columns.str.lower()
 foldx_df =  pd.read_csv(infile_foldx , sep = ',')

@ -201,8 +224,9 @@ print('==================================='
      , '\ndssp_kd_dfs + rd_df'
      , '\n===================================')
 #dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = o_join)
-merging_cols_m3 = detect_common_cols(dssp_df, kd_df)
-dssp_kd_rd_dfs = pd.merge(dssp_kd_dfs, rd_df, on = merging_cols_m3, how = o_join)
+merging_cols_m3 = detect_common_cols(dssp_kd_dfs,  rd_df)
+dssp_kd_rd_dfs = pd.merge(dssp_kd_dfs, rd_df, on = merging_cols_m3
+                          , how = o_join)

 ncols_m3 = len(dssp_kd_rd_dfs.columns)