Handled rpoB 5UHC structure position offset in mcsm_ppi2

2022-01-04 10:45:29 +00:00 · 2022-01-04 10:45:29 +00:00 · 00b84ccb1c
commit 00b84ccb1c
parent 46e2c93885
30 changed files with 395 additions and 63 deletions
--- a/scripts/data_extraction_epistasis.py
+++ b/scripts/data_extraction_epistasis.py
@ -75,15 +75,14 @@ args = arg_parser.parse_args()
 drug = args.drug
 gene = args.gene

-#drug = 'pyrazinamide'
-#gene = 'pncA'
-
 gene_match = gene + '_p.'
 print('mut pattern for gene', gene, ':',  gene_match)

 nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
 print('nsSNP for gene', gene, ':',  nssnp_match)

+nssnp_match2 = re.compile(nssnp_match)
+
 wt_regex = gene_match.lower()+'([A-Za-z]{3})'
 print('wt regex:', wt_regex)

@ -219,20 +218,21 @@ meta_gene_epi = meta_gene_multi.loc[(meta_gene_multi['dr_mult_snp_count']>1) | (

 #%% TEST
 # formatting, replace !nssnp_match  with nothing
-foo1 = 	'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
-foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'
+#foo1 = 	'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
+#foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'


-foo1_s = foo1.split(';')
-foo1_s
-nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
-arse=list(filter(nssnp_match2.match, foo1_s))
-arse
+#foo1_s = foo1.split(';')
+#foo1_s
+#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
+#arse=list(filter(nssnp_match2.match, foo1_s))
+#arse
+
+#foo1_s2 = ';'.join(arse)
+#foo1_s2

-foo1_s2 = ';'.join(arse)
-foo1_s2
 #%%
-nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
+#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')

 # dr_muts_col
 dr_clean_col = dr_muts_col + '_clean'
@ -248,6 +248,7 @@ for i, v in enumerate(meta_gene_epi[dr_muts_col]):
    dr2_s = v.split(';')
    print(dr2_s)
    dr2_sf = list(filter(nssnp_match2.match, dr2_s))
+    #dr2_sf = list(filter(nssnp_match.match, dr2_s))
    print(dr2_sf)
    dr2_sf2  = ';'.join(dr2_sf)
    meta_gene_epi[dr_clean_col].iloc[i] = dr2_sf2
@ -262,13 +263,13 @@ meta_gene_epi[other_clean_col] = ''

 for i, v in enumerate(meta_gene_epi[other_muts_col]):
    #print(i, v)
-    print('======================================================')
-    print(i)
-    print(v)
+    #print('======================================================')
+    #print(i)
+    #print(v)
    other2_s = v.split(';')
-    print(other2_s)
+    #print(other2_s)
    other2_sf = list(filter(nssnp_match2.match,  other2_s))
-    print(other2_sf)
+    #print(other2_sf)
    other2_sf2  = ';'.join(other2_sf)
    meta_gene_epi[other_clean_col].iloc[i] =  other2_sf2

@ -281,7 +282,8 @@ meta_gene_epi_f = meta_gene_epi[['id', 'sample'
                               , 'dr_mult_snp_count'
                               , other_muts_col, other_clean_col
                               , 'other_mult_snp_count']]
-meta_gene_epi_f.columns
+#print(meta_gene_epi_f.columns)
+print(meta_gene_epi_f)

 cols_to_output = ['id', 'sample'
                   , dr_clean_col
@ -293,7 +295,6 @@ cols_to_output = ['id', 'sample'
 meta_gene_epi_f2 = meta_gene_epi_f[cols_to_output]


-
 #%%
 # formatting, replace !nssnp_match  with nothing
 #nssnp_neg_match = '(?!pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})'