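various changes (in the file shown: accept mixed-case residue codes in the mutation regexes, extract fewer metadata columns, write the gwas CSV without the index, and name the drug in the Sus/Res summary output)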

This commit is contained in:
Tanushree Tunstall 2020-09-08 17:13:02 +01:00
parent c72269dcd1
commit e4608342a4
3 changed files with 199 additions and 95 deletions

View file

@ -81,16 +81,16 @@ gene = args.gene
gene_match = gene + '_p.'
print('mut pattern for gene', gene, ':', gene_match)
nssnp_match = gene_match +'[A-Z]{3}[0-9]+[A-Z]{3}'
nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
print('nsSNP for gene', gene, ':', nssnp_match)
wt_regex = gene_match.lower()+'(\w{3})'
wt_regex = gene_match.lower()+'([A-Za-z]{3})'
print('wt regex:', wt_regex)
mut_regex = r'\d+(\w{3})$'
mut_regex = r'[0-9]+(\w{3})$'
print('mt regex:', mut_regex)
pos_regex = r'(\d+)'
pos_regex = r'([0-9]+)'
print('position regex:', pos_regex)
# building cols to extract
@ -154,30 +154,29 @@ if in_filename_master == 'original_tanushree_data_v2.csv':
else:
core_cols = ['id'
, 'sample'
, 'patient_id'
, 'strain'
#, 'patient_id'
#, 'strain'
, 'lineage'
, 'sublineage'
, 'country'
#, 'country'
, 'country_code'
, 'geographic_source'
#, 'region'
, 'location'
, 'host_body_site'
, 'environment_material'
, 'host_status'
, 'host_sex'
, 'submitted_host_sex'
, 'hiv_status'
, 'HIV_status'
, 'tissue_type'
, 'isolation_source'
#, 'location'
#, 'host_body_site'
#, 'environment_material'
#, 'host_status'
#, 'host_sex'
#, 'submitted_host_sex'
#, 'hiv_status'
#, 'HIV_status'
#, 'tissue_type'
#, 'isolation_source'
, resistance_col]
variable_based_cols = [drug
, dr_muts_col
, other_muts_col]
#, resistance_col]
cols_to_extract = core_cols + variable_based_cols
print('Extracting', len(cols_to_extract), 'columns from master data')
@ -200,7 +199,7 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum()
#%% Write check file
check_file = outdir + '/' + gene.lower() + '_gwas.csv'
meta_data.to_csv(check_file)
meta_data.to_csv(check_file, index = False)
print('Writing subsetted gwas data'
, '\nFile', check_file
, '\nDim:', meta_data.shape)
@ -215,9 +214,9 @@ print('Writing subsetted gwas data'
# drug counts: complete samples for OR calcs
meta_data[drug].value_counts()
print('===========================================================\n'
, 'RESULT: No. of Sus and Res samples:\n', meta_data[drug].value_counts()
, 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
, '\n===========================================================\n'
, 'RESULT: Percentage of Sus and Res samples:\n', meta_data[drug].value_counts(normalize = True)*100
, 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
, '\n===========================================================')
#%%