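various changes: accept mixed-case three-letter codes in the nsSNP/wt regexes, trim the extracted metadata columns, write the gwas CSV without the index, and name the drug in the Sus/Res summaries (data_extraction.py)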

2020-09-08 17:13:02 +01:00 · 2020-09-08 17:13:02 +01:00 · e4608342a4
commit e4608342a4
parent c72269dcd1
3 changed files with 199 additions and 95 deletions
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -81,16 +81,16 @@ gene = args.gene
 gene_match = gene + '_p.'
 print('mut pattern for gene', gene, ':',  gene_match)

-nssnp_match = gene_match +'[A-Z]{3}[0-9]+[A-Z]{3}'
+nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}'
 print('nsSNP for gene', gene, ':',  nssnp_match)

-wt_regex = gene_match.lower()+'(\w{3})'
+wt_regex = gene_match.lower()+'([A-Za-z]{3})'
 print('wt regex:', wt_regex)

-mut_regex = r'\d+(\w{3})$'
+mut_regex = r'[0-9]+(\w{3})$'
 print('mt regex:', mut_regex)

-pos_regex = r'(\d+)'
+pos_regex = r'([0-9]+)'
 print('position regex:', pos_regex)

 # building cols to extract
@@ -154,30 +154,29 @@ if in_filename_master == 'original_tanushree_data_v2.csv':
 else:
    core_cols = ['id'
                 , 'sample'
-                 , 'patient_id'
-                 , 'strain'
+                 #, 'patient_id'
+                 #, 'strain'
                 , 'lineage'
                 , 'sublineage' 
-                 , 'country'
+                 #, 'country'
                 , 'country_code'
                 , 'geographic_source'
                 #, 'region'
-                 , 'location'
-                 , 'host_body_site'
-                 , 'environment_material'
-                 , 'host_status'
-                 , 'host_sex'
-                 , 'submitted_host_sex'
-                 , 'hiv_status'
-                 , 'HIV_status'
-                 , 'tissue_type'
-                 , 'isolation_source'
+                 #, 'location'
+                 #, 'host_body_site'
+                 #, 'environment_material'
+                 #, 'host_status'
+                 #, 'host_sex'
+                 #, 'submitted_host_sex'
+                 #, 'hiv_status'
+                 #, 'HIV_status'
+                 #, 'tissue_type'
+                 #, 'isolation_source'
                 ,  resistance_col]
    
    variable_based_cols = [drug
                           , dr_muts_col
                           , other_muts_col]
-                           #, resistance_col]
       
    cols_to_extract = core_cols + variable_based_cols
    print('Extracting', len(cols_to_extract), 'columns from master data')
@@ -200,7 +199,7 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum()

 #%% Write check file
 check_file = outdir + '/' + gene.lower() + '_gwas.csv'
-meta_data.to_csv(check_file)
+meta_data.to_csv(check_file, index = False)
 print('Writing subsetted gwas data'
      , '\nFile', check_file
      , '\nDim:', meta_data.shape)
@@ -215,9 +214,9 @@ print('Writing subsetted gwas data'
 # drug counts: complete samples for OR calcs
 meta_data[drug].value_counts() 
 print('===========================================================\n'
-      , 'RESULT: No. of Sus and Res samples:\n', meta_data[drug].value_counts()
+      , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
 	  , '\n===========================================================\n'
-      , 'RESULT: Percentage of Sus and Res samples:\n', meta_data[drug].value_counts(normalize = True)*100
+      , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
 		, '\n===========================================================')

 #%%