From dcd9a985ec6f06cf468febc69bb9b12a2b1761ea Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Fri, 7 Aug 2020 13:35:02 +0100
Subject: [PATCH] saving work, ready for more remote working

---
Review notes (below the three-dash line, so not part of the commit message):
  * The v5 input-file banner printed 'Input file v4: ' while passing
    infile_master_v5; it now prints 'Input file v5: ' so the log names the
    file that is actually loaded.
  * The v5 missing-value summary named its percentage column
    'v4_na_percent'; it is now 'v5_na_percent', in line with the
    "v5_na_count" series name set two lines earlier.
  * The '#34k' remark on the v5 filename is carried over from the v4 line;
    TODO confirm it also holds for mtb_gwas_meta_v5.csv.
  * Line breaks and leading whitespace were restored from a
    whitespace-collapsed copy of this patch. Context-line indentation, the
    position of blank context lines, and the (presumably whitespace-only)
    change on the master_data_v4 line could not be recovered exactly, and
    the post-image hash on the index line predates the two label fixes.
    Apply with `git apply --ignore-whitespace`, or regenerate the patch
    from the repository.

 scripts/pre_data_extraction.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/scripts/pre_data_extraction.py b/scripts/pre_data_extraction.py
index 98ea5d9..876299b 100755
--- a/scripts/pre_data_extraction.py
+++ b/scripts/pre_data_extraction.py
@@ -106,6 +106,11 @@ in_filename_v4 = 'mtb_gwas_meta_v4.csv' #34k
 infile_master_v4 = datadir + '/' + in_filename_v4
 print('Input file v4: ', infile_master_v4
  , '\n============================================================')
+
+in_filename_v5 = 'mtb_gwas_meta_v5.csv' #34k
+infile_master_v5 = datadir + '/' + in_filename_v5
+print('Input file v5: ', infile_master_v5
+ , '\n============================================================')
 
 #=======
 # output
@@ -120,7 +125,8 @@ print('Output filename: in the respective sections'
 #%% Read input file
 master_data_v2 = pd.read_csv(infile_master_v2, sep = ',', dtype = 'unicode') # ascii
 master_data_v3 = pd.read_csv(infile_master_v3, sep = ',', dtype = 'unicode')
-master_data_v4 = pd.read_csv(infile_master_v4, sep = ',', dtype = 'unicode')
+master_data_v4 = pd.read_csv(infile_master_v4, sep = ',', dtype = 'unicode')
+master_data_v5 = pd.read_csv(infile_master_v5, sep = ',', dtype = 'unicode')
 
 #DtypeWarning: Columns (48) have mixed types.Specify dtype option on import or set low_memory=False.
 # interactivity=interactivity, compiler=compiler, result=result)
@@ -188,6 +194,26 @@ v4 = master_data_v4[['id'
 v4.isna().sum()
 
 print('complete samples v4:', v4['id'].nunique() - v4[drug].isna().sum())
 
+#==================================================================
+v5_na = master_data_v5.isna().sum()
+v5_na.name = "v5_na_count"
+v5_na = v5_na.to_frame()
+v5_na['v5_na_percent'] = master_data_v5.isna().mean().round(4)*100
+
+v5 = master_data_v5[['id'
+, 'country'
+, 'lineage'
+, 'sublineage'
+, drug
+, dr_muts_col
+, other_muts_col]]
+
+v5.isna().sum()
+
+print('complete samples v5:', v5['id'].nunique() - v5[drug].isna().sum())
+
+
+
 #====================================================================
 # checking ids