Handle the rpoB 5UHC position offset in mcsm_ppi2

This commit is contained in:
Tanushree Tunstall 2022-01-04 10:45:29 +00:00
parent 46e2c93885
commit 00b84ccb1c
30 changed files with 395 additions and 63 deletions

View file

@ -75,15 +75,14 @@ args = arg_parser.parse_args()
drug = args.drug
gene = args.gene
#drug = 'pyrazinamide'
#gene = 'pncA'
gene_match = gene + '_p.'
print('mut pattern for gene', gene, ':', gene_match)
nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
print('nsSNP for gene', gene, ':', nssnp_match)
nssnp_match2 = re.compile(nssnp_match)
wt_regex = gene_match.lower()+'([A-Za-z]{3})'
print('wt regex:', wt_regex)
@ -219,20 +218,21 @@ meta_gene_epi = meta_gene_multi.loc[(meta_gene_multi['dr_mult_snp_count']>1) | (
#%% TEST
# formatting: drop every ';'-separated entry that does not match nssnp_match
foo1 = 'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'
#foo1 = 'pncA_p.Thr47Pro;pncA_p.Thr61Pro;rpsA_c.XX'
#foo2 = 'pncA_Chromosome:g.2288693_2289280del; WT; pncA_p.Thr61Ala'
foo1_s = foo1.split(';')
foo1_s
nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
arse=list(filter(nssnp_match2.match, foo1_s))
arse
#foo1_s = foo1.split(';')
#foo1_s
#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
#arse=list(filter(nssnp_match2.match, foo1_s))
#arse
#foo1_s2 = ';'.join(arse)
#foo1_s2
foo1_s2 = ';'.join(arse)
foo1_s2
#%%
nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
#nssnp_match2 = re.compile('(pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})')
# dr_muts_col
dr_clean_col = dr_muts_col + '_clean'
@ -248,6 +248,7 @@ for i, v in enumerate(meta_gene_epi[dr_muts_col]):
dr2_s = v.split(';')
print(dr2_s)
dr2_sf = list(filter(nssnp_match2.match, dr2_s))
#dr2_sf = list(filter(nssnp_match.match, dr2_s))
print(dr2_sf)
dr2_sf2 = ';'.join(dr2_sf)
meta_gene_epi[dr_clean_col].iloc[i] = dr2_sf2
@ -262,13 +263,13 @@ meta_gene_epi[other_clean_col] = ''
for i, v in enumerate(meta_gene_epi[other_muts_col]):
#print(i, v)
print('======================================================')
print(i)
print(v)
#print('======================================================')
#print(i)
#print(v)
other2_s = v.split(';')
print(other2_s)
#print(other2_s)
other2_sf = list(filter(nssnp_match2.match, other2_s))
print(other2_sf)
#print(other2_sf)
other2_sf2 = ';'.join(other2_sf)
meta_gene_epi[other_clean_col].iloc[i] = other2_sf2
@ -281,7 +282,8 @@ meta_gene_epi_f = meta_gene_epi[['id', 'sample'
, 'dr_mult_snp_count'
, other_muts_col, other_clean_col
, 'other_mult_snp_count']]
meta_gene_epi_f.columns
#print(meta_gene_epi_f.columns)
print(meta_gene_epi_f)
cols_to_output = ['id', 'sample'
, dr_clean_col
@ -293,7 +295,6 @@ cols_to_output = ['id', 'sample'
meta_gene_epi_f2 = meta_gene_epi_f[cols_to_output]
#%%
# formatting: drop every entry that does not match nssnp_match
#nssnp_neg_match = '(?!pncA_p.[A-Za-z]{3}[0-9]+[A-Za-z]{3})'