saving previous work from home pc

2020-01-23 09:29:20 +00:00 · 2020-01-23 09:29:20 +00:00 · be213cb7e9
commit be213cb7e9
parent cae9c550a4
3 changed files with 146 additions and 106 deletions
--- a/meta_data_analysis/pycache/reference_dict.cpython-37.pyc
+++ b/meta_data_analysis/pycache/reference_dict.cpython-37.pyc
--- a/meta_data_analysis/pnca_data_extraction.py
+++ b/meta_data_analysis/pnca_data_extraction.py
@ -23,11 +23,7 @@ import pandas as pd
 #my_dir = os.path.expanduser('~/some_dir')
 #make sure mcsm_analysis/ exists
 #or specify the output directory
-
+#====================================================
 #%%
 #%%
 #%%
 #========================================================
 # TASK: extract ALL pncA mutations from GWAS data
 #========================================================
 #%%
@ -40,83 +36,45 @@ os.getcwd()
 #%%
 from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
 #%%
-#NOTE: Out_dir MUST exis
+############# specify variables for input and output paths and filenames
-# User defined dir strpyrazinamide
+drug = "pyrazinamide"
-drug = 'pyrazinamide'
+gene = "pnca"
 gene = 'pnca'
 out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
 # = out_dir + drug
 data_dir = homedir + '/git/Data'
-if not os.path.exists(data_dir):
+datadir = homedir + "/git/Data"
-    print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
+basedir =  datadir + "/" + drug + "/input"
-    os.makedirs(data_dir)
+
 # input
 inpath = "/original"
 in_filename  = "/original_tanushree_data_v2.csv"
 infile = basedir + inpath + in_filename
 #print(infile)
 # output: several output files
 # output filenames in respective sections at the time of outputting files
 outpath = "/processed"
 outdir = basedir + outpath
 #print(outdir)
 if not os.path.exists(datadir):
    print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
    os.makedirs(datadir)
    die()
-if not os.path.exists(out_dir):
+if not os.path.exists(outdir):
-    print('Error!', out_dir, 'does not exist. Please create it')
+    print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
    exit()
 #if not os.path.exists(work_dir):
 #    print('creating dir that does not exist', 'dir_name:', work_dir)
 #    os.makedirs(work_dir)
 else:
    print('Dir exists: Carrying on')
-# now create sub dir structure within work_dir
+################## end of variable assignment for input and output files
 # pyrazinamide/mcsm_analysis
 # we need three dir
 # Data
 # Scripts
    # Plotting
 # Results
    # Plots
 # create a list of dir names
 #dir_names = ['Data', 'Scripts', 'Results']
 #for i in dir_names:
 #    this_dir = (work_dir + '/' + i)
 #    if not os.path.exists(this_dir):
 #        print('creating dir that does not exist:', this_dir)
 #        os.makedirs(this_dir)
 #else:
 #    print('Dir exists: Carrying on')
 # Create sub dirs
 # 1)        
 # Scripts
    # Plotting
 #subdir_plotting = work_dir + '/Scripts/Plotting'
 #if not os.path.exists(subdir_plotting):
 #      print('creating dir that does not exist:', subdir_plotting)
 #      os.makedirs(subdir_plotting)
 #else:
 #    print('Dir exists: Carrying on')
 # 2)    
 # Results
    # Plots
 #subdir_plots = work_dir + '/Results/Plots'        
 #if not os.path.exists(subdir_plots):
 #        print('creating dir that does not exist:', subdir_plots)
 #        os.makedirs(subdir_plots)    
 #else:
 #    print('Dir exists: Carrying on')
 # clear varaibles
 #del(dir_names, drug, i, subdir_plots, subdir_plotting)
 #exit()
 #%%
 #==============================================================================
 ############
 # STEP 1: Read file original_tanushree_data_v2.csv
 ############
-data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
+#data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
-meta_data = pd.read_csv(data_file, sep = ',') 
+meta_data = pd.read_csv(infile, sep = ',')  
 # column names
 list(meta_data.columns)
@ -130,7 +88,7 @@ meta_data = meta_data[['id'
       , 'pyrazinamide'
       , 'dr_mutations_pyrazinamide'
       , 'other_mutations_pyrazinamide'
-        ]] 
+        ]] #19265, 67
 # checks
 total_samples = meta_data['id'].nunique() # 19265
@ -143,7 +101,12 @@ meta_data.head()
 # equivalent of table in R
 # pyrazinamide counts
-meta_data.pyrazinamide.value_counts() 
+meta_data.pyrazinamide.value_counts() #12511
 #0.0    10565
 #1.0     1946 {RESULT: No. of Resistant and Susceptible samples}
 clear variables
 #del(basedir, datadir, inpath, infile)
 #%%
 ############
@ -155,7 +118,16 @@ meta_data.pyrazinamide.value_counts()
 # and drop the entries with NA
 #############
 meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] 
 #2163 {RESULT: samples with dr_muts}
 dr_id = meta_pza['id'].unique()
 meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] 
 #526 (RESULT: samples with other_muts)
 other_id = meta_pza['id'].unique()
 # FIXME: See if the sample ids are unique in each
 # find any common IDs
 dr_id.isin(other_id[1,1])
 del(meta_pza)
@ -188,8 +160,9 @@ del(meta_pnca_other)
 # Now extract "all" mutations
 meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
 #2665, 8
-meta_pnca_all['id'].nunique() 
+meta_pnca_all['id'].nunique() {#RESULT: pnca mutations in ALL samples}
 pnca_samples = len(meta_pnca_all)
 pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() 
 comp_pnca_samples = pnca_samples - pnca_na 
@ -468,8 +441,11 @@ meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF
 #=========
 # Step 12a: all SNPs to run mCSM
 #=========
-snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) 
+snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) #336
-pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) 
+pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) #131
 # since 186 is not part of struc: it is one less
 #FIXME!
 # assign meaningful colnames 
 #snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
@ -480,24 +456,27 @@ snps_only.isna().sum() # should be 0
 # specify variable name for output file
 gene = 'pnca'
 #drug = 'pyrazinamide'
-my_fname1 = '_snps_'
+my_fname1 = '_snps'
 nrows = len(snps_only) 
 #output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
 #output_file_path = work_dir + '/Data/'
-output_file_path = data_dir + '/input/processed/' + drug + '/'
+#output_file_path = data_dir + '/input/processed/' + drug + '/'
 print(outdir)
-if not os.path.exists(output_file_path):
+if not os.path.exists(outdir):
-    print( output_file_path, 'does not exist. Creating')
+    print( outdir, 'does not exist. Creating')
-    os.makedirs(output_file_path)
+    os.makedirs(outdir)
    exit()
-output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
+#out_filename = gene + my_fname1 + str(nrows) + '.csv'
-print(output_filename) #<<<- check
+out_filename = '/' + gene + my_fname1 + '.csv'
 outfile = outdir + out_filename
 print(outfile) #<<<- check
 # write to csv: without column or row names
 # Bad practice: numbers at the start of a filename
-snps_only.to_csv(output_filename, header = False, index = False)
+snps_only.to_csv(out_filename, header = False, index = False)
 #=========
 # Step 12b: all snps with annotation
@ -519,12 +498,23 @@ snps_only.to_csv(output_filename, header = False, index = False)
 #my_fname2 = '_snps_with_metadata_'
 #nrows = len(pnca_snps_ALL) 
 #output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
 #output_file_path = work_dir + '/Data/'
-#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
+#output_file_path = data_dir + '/input/processed/' + drug + '/'
-#print(output_filename)  #<<<- check
+print(outdir)
 if not os.path.exists(outdir):
    print( outdir, 'does not exist. Creating')
    os.makedirs(outdir)
    exit()
 #out_filename = gene + my_fname2 + str(nrows) + '.csv'
 out_filename = '/' + gene + my_fname1 + '.csv'
 outfile = outdir + out_filename
 print(outfile) #<<<- check
 # write out file
-#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
+#pnca_snps_ALL.to_csv(outfile, header = True, index = False)
 #=========
 # Step 12c: comp snps for OR calcs with annotation
@ -547,7 +537,7 @@ meta_pnca_LF2['mutation'].nunique()
 meta_pnca_LF2.groupby('mutation_info').nunique() 
 # sanity check
-meta_pnca_LF2['id'].nunique() 
+meta_pnca_LF2['id'].nunique()  #1908
 # should be True
 if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
@ -569,15 +559,26 @@ len(pnca_snps_COMP)
 gene = 'pnca'
 #drug = 'pyrazinamide'
-my_fname3 = '_comp_snps_with_metadata_'
+my_fname3 = '_comp_snps_with_metadata'
 nrows = len(pnca_snps_COMP) 
-#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
+#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
-#print(output_filename) #<<<-check
+#output_file_path = work_dir + '/Data/'
 #output_file_path = data_dir + '/input/processed/' + drug + '/'
 print(outdir)
 if not os.path.exists(outdir):
    print( outdir, 'does not exist. Creating')
    os.makedirs(outdir)
    exit()
 #out_filename = gene + my_fname3 + str(nrows) + '.csv'
 #out_filename = '/' + gene + my_fname3 + '.csv'
 #outfile = outdir + out_filename
 #print(outfile) #<<<- check
 # write out file
-#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)
+#pnca_snps_COMP.to_csv(outfile, header = True, index = False)
 #=========
 # Step 12d: comp snps only
@ -589,15 +590,26 @@ snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
 gene = 'pnca'
 #drug = 'pyrazinamide'
-my_fname1 = '_comp_snps_'
+my_fname1 = '_comp_snps'
 nrows = len(snps_only) 
-output_filename = output_file_path + gene + my_fname1  + str(nrows) + '.csv'
+#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
-print(output_filename) #<<<- check
+#output_file_path = work_dir + '/Data/'
 #output_file_path = data_dir + '/input/processed/' + drug + '/'
 print(outdir)
 if not os.path.exists(outdir):
    print( outdir, 'does not exist. Creating')
    os.makedirs(outdir)
    exit()
 #out_filename = gene + my_fname1 + str(nrows) + '.csv'
 out_filename = '/' + gene + my_fname1 + '.csv'
 outfile = outdir + out_filename
 print(outfile) #<<<- check
 # write to csv: without column or row names
-snps_only.to_csv(output_filename, header = False, index = False)
+snps_only.to_csv(outfile, header = False, index = False)
 #=#=#=#=#=#=#=#
 # COMMENT: LF1 is the file to extract all unique snps for mcsm 
@ -619,8 +631,21 @@ gene = 'pnca'
 #drug = 'pyrazinamide'
 my_fname4 = '_metadata'
 #nrows = len(meta_pnca_LF1)
-output_filename = output_file_path + gene + my_fname4  + '.csv'
+
-print(output_filename) #<<<-check
+#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
 #output_file_path = work_dir + '/Data/'
 #output_file_path = data_dir + '/input/processed/' + drug + '/'
 print(outdir)
 if not os.path.exists(outdir):
    print( outdir, 'does not exist. Creating')
    os.makedirs(outdir)
    exit()
 #out_filename = gene + my_fname1 + str(nrows) + '.csv'
 out_filename = '/' + gene + my_fname4 + '.csv'
 outfile = outdir + out_filename
 print(outfile) #<<<- check
 # write out file
-meta_pnca_LF1.to_csv(output_filename) 
+meta_pnca_LF1.to_csv(outfile) 
--- a/meta_data_analysis/reference_dict.py
+++ b/meta_data_analysis/reference_dict.py
@ -6,7 +6,7 @@ Created on Tue Jun 18 11:32:28 2019
@author: tanushree
 """
 ############################################
-#load libraries
+# load libraries
 import pandas as pd
 import os
 #############################################
@ -17,14 +17,29 @@ import os
 # containing GWAS data
 #!#########################!
 #print(os.getcwd()) 
 #homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
 #os.chdir(homedir + '/git/Data/pyrazinamide/input/original') 
 print(os.getcwd())
-homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+
-os.chdir(homedir + '/git/Data/input/original') 
+#%%
-print(os.getcwd())
+############# specify variables for input and output paths and filenames
 drug = 'pyrazinamide'
 #gene = 'pnca'
 datadir = homedir + '/git/Data'
 basedir =  datadir + '/' + drug + '/input'
 # input
 inpath = "/original"
 in_filename  = "/aa_codes.csv"
 infile = basedir + inpath + in_filename
 print(infile)
 #==========
 #read file
 #==========
-my_aa = pd.read_csv('aa_codes.csv') #20, 6
+my_aa = pd.read_csv(infile) #20, 6
 #assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
 #my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6  #a way to it since it is the first column
 my_aa = my_aa.set_index('three_letter_code_lower') #20, 5