saving work, ready for more remote working

This commit is contained in:
Tanushree Tunstall 2020-08-07 13:35:02 +01:00
parent 13203e6fe0
commit dcd9a985ec

View file

@ -106,6 +106,11 @@ in_filename_v4 = 'mtb_gwas_meta_v4.csv' #34k
infile_master_v4 = datadir + '/' + in_filename_v4 infile_master_v4 = datadir + '/' + in_filename_v4
print('Input file v4: ', infile_master_v4 print('Input file v4: ', infile_master_v4
, '\n============================================================') , '\n============================================================')
# v5 master metadata file
# NOTE(review): the '#34k' size note matches the v4 line - confirm it holds for v5
in_filename_v5 = 'mtb_gwas_meta_v5.csv' #34k
infile_master_v5 = datadir + '/' + in_filename_v5
# label the path as v5 so the log output is not confused with the v4 file
print('Input file v5: ', infile_master_v5
      , '\n============================================================')
#======= #=======
# output # output
@ -120,7 +125,8 @@ print('Output filename: in the respective sections'
#%% Read input file #%% Read input file
master_data_v2 = pd.read_csv(infile_master_v2, sep = ',', dtype = 'unicode') # ascii master_data_v2 = pd.read_csv(infile_master_v2, sep = ',', dtype = 'unicode') # ascii
master_data_v3 = pd.read_csv(infile_master_v3, sep = ',', dtype = 'unicode') master_data_v3 = pd.read_csv(infile_master_v3, sep = ',', dtype = 'unicode')
master_data_v4 = pd.read_csv(infile_master_v4, sep = ',', dtype = 'unicode') master_data_v4 = pd.read_csv(infile_master_v4, sep = ',', dtype = 'unicode')
# read the v5 master file with the same separator and dtype settings as v2-v4
master_data_v5 = pd.read_csv(
    infile_master_v5
    , sep = ','
    , dtype = 'unicode'
)
#DtypeWarning: Columns (48) have mixed types.Specify dtype option on import or set low_memory=False. #DtypeWarning: Columns (48) have mixed types.Specify dtype option on import or set low_memory=False.
# interactivity=interactivity, compiler=compiler, result=result) # interactivity=interactivity, compiler=compiler, result=result)
@ -188,6 +194,26 @@ v4 = master_data_v4[['id'
v4.isna().sum() v4.isna().sum()
print('complete samples v4:', v4['id'].nunique() - v4[drug].isna().sum()) print('complete samples v4:', v4['id'].nunique() - v4[drug].isna().sum())
#==================================================================
# Per-column missing-value summary for the v5 master data:
# one row per column of master_data_v5, with the NA count and NA percentage.
v5_na = master_data_v5.isna().sum()
v5_na.name = "v5_na_count"
v5_na = v5_na.to_frame()
# percentage of missing values per column; labelled v5 to match 'v5_na_count'
v5_na['v5_na_percent'] = master_data_v5.isna().mean().round(4)*100

# Keep only the id, geography/lineage, drug and mutation columns for v5
v5 = master_data_v5[['id'
                     , 'country'
                     , 'lineage'
                     , 'sublineage'
                     , drug
                     , dr_muts_col
                     , other_muts_col]]

# bare expression: the NA counts are only displayed when run interactively
v5.isna().sum()

# number of unique ids minus the number of rows with a missing drug value
print('complete samples v5:', v5['id'].nunique() - v5[drug].isna().sum())
#==================================================================== #====================================================================
# checking ids # checking ids