diff --git a/meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc b/meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc index 1db3c83..3c936f9 100644 Binary files a/meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc and b/meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc differ diff --git a/meta_data_analysis/pnca_data_extraction.py b/meta_data_analysis/pnca_data_extraction.py index 0eb5aab..e05c39d 100755 --- a/meta_data_analysis/pnca_data_extraction.py +++ b/meta_data_analysis/pnca_data_extraction.py @@ -23,11 +23,7 @@ import pandas as pd #my_dir = os.path.expanduser('~/some_dir') #make sure mcsm_analysis/ exists #or specify the output directory - -#%% -#%% -#%% -#======================================================== +#==================================================== # TASK: extract ALL pncA mutations from GWAS data #======================================================== #%% @@ -40,83 +36,45 @@ os.getcwd() #%% from reference_dict import my_aa_dict #CHECK DIR STRUC THERE! #%% -#NOTE: Out_dir MUST exis -# User defined dir strpyrazinamide -drug = 'pyrazinamide' -gene = 'pnca' -out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/' -# = out_dir + drug -data_dir = homedir + '/git/Data' +############# specify variables for input and output paths and filenames +drug = "pyrazinamide" +gene = "pnca" -if not os.path.exists(data_dir): - print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data') - os.makedirs(data_dir) +datadir = homedir + "/git/Data" +basedir = datadir + "/" + drug + "/input" + +# input +inpath = "/original" +in_filename = "/original_tanushree_data_v2.csv" +infile = basedir + inpath + in_filename +#print(infile) + +# output: several output files +# output filenames in respective sections at the time of outputting files +outpath = "/processed" +outdir = basedir + outpath +#print(outdir) + +if not os.path.exists(datadir): + print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md') + os.makedirs(datadir) die() -if not os.path.exists(out_dir): - print('Error!', out_dir, 'does not exist. Please create it') +if not os.path.exists(outdir): + print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md') exit() -#if not os.path.exists(work_dir): -# print('creating dir that does not exist', 'dir_name:', work_dir) -# os.makedirs(work_dir) else: print('Dir exists: Carrying on') -# now create sub dir structure within work_dir -# pyrazinamide/mcsm_analysis - -# we need three dir -# Data -# Scripts - # Plotting -# Results - # Plots - -# create a list of dir names -#dir_names = ['Data', 'Scripts', 'Results'] - - -#for i in dir_names: -# this_dir = (work_dir + '/' + i) -# if not os.path.exists(this_dir): -# print('creating dir that does not exist:', this_dir) -# os.makedirs(this_dir) -#else: -# print('Dir exists: Carrying on') - -# Create sub dirs -# 1) -# Scripts - # Plotting -#subdir_plotting = work_dir + '/Scripts/Plotting' -#if not os.path.exists(subdir_plotting): -# print('creating dir that does not exist:', subdir_plotting) -# os.makedirs(subdir_plotting) -#else: -# print('Dir exists: Carrying on') - -# 2) -# Results - # Plots -#subdir_plots = work_dir + '/Results/Plots' -#if not os.path.exists(subdir_plots): -# print('creating dir that does not exist:', subdir_plots) -# os.makedirs(subdir_plots) -#else: -# print('Dir exists: Carrying on') - -# clear varaibles -#del(dir_names, drug, i, subdir_plots, subdir_plotting) - -#exit() +################## end of variable assignment for input and output files #%% #============================================================================== ############ # STEP 1: Read file original_tanushree_data_v2.csv ############ -data_file = data_dir + '/input/original/original_tanushree_data_v2.csv' -meta_data = pd.read_csv(data_file, sep = ',') +#data_file = data_dir + '/input/original/original_tanushree_data_v2.csv' +meta_data = pd.read_csv(infile, sep = ',') # column names list(meta_data.columns) @@ -130,7 +88,7 @@ meta_data = meta_data[['id' , 'pyrazinamide' , 'dr_mutations_pyrazinamide' , 'other_mutations_pyrazinamide' - ]] + ]] #19265, 67 # checks total_samples = meta_data['id'].nunique() # 19265 @@ -143,7 +101,12 @@ meta_data.head() # equivalent of table in R # pyrazinamide counts -meta_data.pyrazinamide.value_counts() +meta_data.pyrazinamide.value_counts() #12511 +#0.0 10565 +#1.0 1946 {RESULT: No. of Resistant and Susceptible samples} + +clear variables +#del(basedir, datadir, inpath, infile) #%% ############ @@ -154,8 +117,17 @@ meta_data.pyrazinamide.value_counts() # (corresponding to a structure) # and drop the entries with NA ############# -meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] -meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] +meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] +#2163 {RESULT: samples with dr_muts} +dr_id = meta_pza['id'].unique() + +meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] +#526 (RESULT: samples with other_muts) +other_id = meta_pza['id'].unique() + +# FIXME: See if the sample ids are unique in each +# find any common IDs +dr_id.isin(other_id[1,1]) del(meta_pza) @@ -188,8 +160,9 @@ del(meta_pnca_other) # Now extract "all" mutations meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ] +#2665, 8 -meta_pnca_all['id'].nunique() +meta_pnca_all['id'].nunique() {#RESULT: pnca mutations in ALL samples} pnca_samples = len(meta_pnca_all) pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() comp_pnca_samples = pnca_samples - pnca_na @@ -468,8 +441,11 @@ meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF #========= # Step 12a: all SNPs to run mCSM #========= -snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) -pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) +snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) #336 +pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) #131 + +# since 186 is not part of struc: it is one less +#FIXME! # assign meaningful colnames #snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True) @@ -480,24 +456,27 @@ snps_only.isna().sum() # should be 0 # specify variable name for output file gene = 'pnca' #drug = 'pyrazinamide' -my_fname1 = '_snps_' +my_fname1 = '_snps' nrows = len(snps_only) #output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/' #output_file_path = work_dir + '/Data/' -output_file_path = data_dir + '/input/processed/' + drug + '/' +#output_file_path = data_dir + '/input/processed/' + drug + '/' +print(outdir) -if not os.path.exists(output_file_path): - print( output_file_path, 'does not exist. Creating') - os.makedirs(output_file_path) +if not os.path.exists(outdir): + print( outdir, 'does not exist. Creating') + os.makedirs(outdir) exit() -output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv' -print(output_filename) #<<<- check +#out_filename = gene + my_fname1 + str(nrows) + '.csv' +out_filename = '/' + gene + my_fname1 + '.csv' +outfile = outdir + out_filename +print(outfile) #<<<- check # write to csv: without column or row names # Bad practice: numbers at the start of a filename -snps_only.to_csv(output_filename, header = False, index = False) +snps_only.to_csv(out_filename, header = False, index = False) #========= # Step 12b: all snps with annotation @@ -519,12 +498,23 @@ snps_only.to_csv(output_filename, header = False, index = False) #my_fname2 = '_snps_with_metadata_' #nrows = len(pnca_snps_ALL) +#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/' #output_file_path = work_dir + '/Data/' -#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv' -#print(output_filename) #<<<- check +#output_file_path = data_dir + '/input/processed/' + drug + '/' +print(outdir) + +if not os.path.exists(outdir): + print( outdir, 'does not exist. Creating') + os.makedirs(outdir) + exit() + +#out_filename = gene + my_fname2 + str(nrows) + '.csv' +out_filename = '/' + gene + my_fname1 + '.csv' +outfile = outdir + out_filename +print(outfile) #<<<- check # write out file -#pnca_snps_ALL.to_csv(output_filename, header = True, index = False) +#pnca_snps_ALL.to_csv(outfile, header = True, index = False) #========= # Step 12c: comp snps for OR calcs with annotation @@ -547,7 +537,7 @@ meta_pnca_LF2['mutation'].nunique() meta_pnca_LF2.groupby('mutation_info').nunique() # sanity check -meta_pnca_LF2['id'].nunique() +meta_pnca_LF2['id'].nunique() #1908 # should be True if meta_pnca_LF2['id'].nunique() == comp_pnca_samples: @@ -569,15 +559,26 @@ len(pnca_snps_COMP) gene = 'pnca' #drug = 'pyrazinamide' -my_fname3 = '_comp_snps_with_metadata_' +my_fname3 = '_comp_snps_with_metadata' nrows = len(pnca_snps_COMP) -#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv' -#print(output_filename) #<<<-check +#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/' +#output_file_path = work_dir + '/Data/' +#output_file_path = data_dir + '/input/processed/' + drug + '/' +print(outdir) + +if not os.path.exists(outdir): + print( outdir, 'does not exist. Creating') + os.makedirs(outdir) + exit() + +#out_filename = gene + my_fname3 + str(nrows) + '.csv' +#out_filename = '/' + gene + my_fname3 + '.csv' +#outfile = outdir + out_filename +#print(outfile) #<<<- check # write out file -#pnca_snps_COMP.to_csv(output_filename, header = True, index = False) - +#pnca_snps_COMP.to_csv(outfile, header = True, index = False) #========= # Step 12d: comp snps only @@ -589,15 +590,26 @@ snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique()) gene = 'pnca' #drug = 'pyrazinamide' -my_fname1 = '_comp_snps_' +my_fname1 = '_comp_snps' nrows = len(snps_only) -output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv' -print(output_filename) #<<<- check +#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/' +#output_file_path = work_dir + '/Data/' +#output_file_path = data_dir + '/input/processed/' + drug + '/' +print(outdir) + +if not os.path.exists(outdir): + print( outdir, 'does not exist. Creating') + os.makedirs(outdir) + exit() + +#out_filename = gene + my_fname1 + str(nrows) + '.csv' +out_filename = '/' + gene + my_fname1 + '.csv' +outfile = outdir + out_filename +print(outfile) #<<<- check # write to csv: without column or row names -snps_only.to_csv(output_filename, header = False, index = False) - +snps_only.to_csv(outfile, header = False, index = False) #=#=#=#=#=#=#=# # COMMENT: LF1 is the file to extract all unique snps for mcsm @@ -619,8 +631,21 @@ gene = 'pnca' #drug = 'pyrazinamide' my_fname4 = '_metadata' #nrows = len(meta_pnca_LF1) -output_filename = output_file_path + gene + my_fname4 + '.csv' -print(output_filename) #<<<-check + +#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/' +#output_file_path = work_dir + '/Data/' +#output_file_path = data_dir + '/input/processed/' + drug + '/' +print(outdir) + +if not os.path.exists(outdir): + print( outdir, 'does not exist. Creating') + os.makedirs(outdir) + exit() + +#out_filename = gene + my_fname1 + str(nrows) + '.csv' +out_filename = '/' + gene + my_fname4 + '.csv' +outfile = outdir + out_filename +print(outfile) #<<<- check # write out file -meta_pnca_LF1.to_csv(output_filename) +meta_pnca_LF1.to_csv(outfile) diff --git a/meta_data_analysis/reference_dict.py b/meta_data_analysis/reference_dict.py index bf6561b..e83ce2c 100755 --- a/meta_data_analysis/reference_dict.py +++ b/meta_data_analysis/reference_dict.py @@ -6,7 +6,7 @@ Created on Tue Jun 18 11:32:28 2019 @author: tanushree """ ############################################ -#load libraries +# load libraries import pandas as pd import os ############################################# @@ -17,14 +17,29 @@ import os # containing GWAS data #!#########################! -print(os.getcwd()) -homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde -os.chdir(homedir + '/git/Data/input/original') +#print(os.getcwd()) +#homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde +#os.chdir(homedir + '/git/Data/pyrazinamide/input/original') print(os.getcwd()) + +#%% +############# specify variables for input and output paths and filenames +drug = 'pyrazinamide' +#gene = 'pnca' + +datadir = homedir + '/git/Data' +basedir = datadir + '/' + drug + '/input' + +# input +inpath = "/original" +in_filename = "/aa_codes.csv" +infile = basedir + inpath + in_filename +print(infile) + #========== #read file #========== -my_aa = pd.read_csv('aa_codes.csv') #20, 6 +my_aa = pd.read_csv(infile) #20, 6 #assign the one_letter code as the row names so that it is easier to create a dict of dicts using index #my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #a way to it since it is the first column my_aa = my_aa.set_index('three_letter_code_lower') #20, 5