saving previous work from home pc

2020-01-23 09:29:20 +00:00 · 2020-01-23 09:29:20 +00:00 · 0ebd8a6d4b
commit 0ebd8a6d4b
parent c56a5e4497
3 changed files with 146 additions and 106 deletions
--- a/meta_data_analysis/pycache/reference_dict.cpython-37.pyc
+++ b/meta_data_analysis/pycache/reference_dict.cpython-37.pyc
--- a/meta_data_analysis/pnca_data_extraction.py
+++ b/meta_data_analysis/pnca_data_extraction.py
@ -23,11 +23,7 @@ import pandas as pd
 #my_dir = os.path.expanduser('~/some_dir')
 #make sure mcsm_analysis/ exists
 #or specify the output directory
-
-#%%
-#%%
-#%%
-#========================================================
+#====================================================
 # TASK: extract ALL pncA mutations from GWAS data
 #========================================================
 #%%
@ -40,83 +36,45 @@ os.getcwd()
 #%%
 from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
 #%%
-#NOTE: Out_dir MUST exis
-# User defined dir strpyrazinamide
-drug = 'pyrazinamide'
-gene = 'pnca'
-out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
-# = out_dir + drug
-data_dir = homedir + '/git/Data'
+############# specify variables for input and output paths and filenames
+drug = "pyrazinamide"
+gene = "pnca"

-if not os.path.exists(data_dir):
-    print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
-    os.makedirs(data_dir)
+datadir = homedir + "/git/Data"
+basedir =  datadir + "/" + drug + "/input"
+
+# input
+inpath = "/original"
+in_filename  = "/original_tanushree_data_v2.csv"
+infile = basedir + inpath + in_filename
+#print(infile)
+
+# output: several output files
+# output filenames in respective sections at the time of outputting files
+outpath = "/processed"
+outdir = basedir + outpath
+#print(outdir)
+
+if not os.path.exists(datadir):
+    print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
+    os.makedirs(datadir)
    die()

-if not os.path.exists(out_dir):
-    print('Error!', out_dir, 'does not exist. Please create it')
+if not os.path.exists(outdir):
+    print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
    exit()
    
-#if not os.path.exists(work_dir):
-#    print('creating dir that does not exist', 'dir_name:', work_dir)
-#    os.makedirs(work_dir)
 else:
    print('Dir exists: Carrying on')

-# now create sub dir structure within work_dir
-# pyrazinamide/mcsm_analysis
-
-# we need three dir
-# Data
-# Scripts
-    # Plotting
-# Results
-    # Plots
-    
-# create a list of dir names
-#dir_names = ['Data', 'Scripts', 'Results']
-
-
-#for i in dir_names:
-#    this_dir = (work_dir + '/' + i)
-#    if not os.path.exists(this_dir):
-#        print('creating dir that does not exist:', this_dir)
-#        os.makedirs(this_dir)
-#else:
-#    print('Dir exists: Carrying on')
-      
-# Create sub dirs
-# 1)        
-# Scripts
-    # Plotting
-#subdir_plotting = work_dir + '/Scripts/Plotting'
-#if not os.path.exists(subdir_plotting):
-#      print('creating dir that does not exist:', subdir_plotting)
-#      os.makedirs(subdir_plotting)
-#else:
-#    print('Dir exists: Carrying on')
- 
-# 2)    
-# Results
-    # Plots
-#subdir_plots = work_dir + '/Results/Plots'        
-#if not os.path.exists(subdir_plots):
-#        print('creating dir that does not exist:', subdir_plots)
-#        os.makedirs(subdir_plots)    
-#else:
-#    print('Dir exists: Carrying on')
-         
-# clear varaibles
-#del(dir_names, drug, i, subdir_plots, subdir_plotting)
-
-#exit()
+################## end of variable assignment for input and output files
 #%%
 #==============================================================================
 ############
 # STEP 1: Read file original_tanushree_data_v2.csv
 ############
-data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
-meta_data = pd.read_csv(data_file, sep = ',') 
+#data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
+meta_data = pd.read_csv(infile, sep = ',')  

 # column names
 list(meta_data.columns)
@ -130,7 +88,7 @@ meta_data = meta_data[['id'
       , 'pyrazinamide'
       , 'dr_mutations_pyrazinamide'
       , 'other_mutations_pyrazinamide'
-        ]] 
+        ]] #19265, 67

 # checks
 total_samples = meta_data['id'].nunique() # 19265
@ -143,7 +101,12 @@ meta_data.head()

 # equivalent of table in R
 # pyrazinamide counts
-meta_data.pyrazinamide.value_counts() 
+meta_data.pyrazinamide.value_counts() #12511
+#0.0    10565
+#1.0     1946 {RESULT: No. of Resistant and Susceptible samples}
+
+clear variables
+#del(basedir, datadir, inpath, infile)

 #%%
 ############
@ -154,8 +117,17 @@ meta_data.pyrazinamide.value_counts()
 # (corresponding to a structure)
 # and drop the entries with NA
 #############
-meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
-meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] 
+#2163 {RESULT: samples with dr_muts}
+dr_id = meta_pza['id'].unique()
+
+meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] 
+#526 (RESULT: samples with other_muts)
+other_id = meta_pza['id'].unique()
+
+# FIXME: See if the sample ids are unique in each
+# find any common IDs
+dr_id.isin(other_id[1,1])

 del(meta_pza)

@ -188,8 +160,9 @@ del(meta_pnca_other)

 # Now extract "all" mutations
 meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
+#2665, 8

-meta_pnca_all['id'].nunique() 
+meta_pnca_all['id'].nunique() {#RESULT: pnca mutations in ALL samples}
 pnca_samples = len(meta_pnca_all)
 pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() 
 comp_pnca_samples = pnca_samples - pnca_na 
@ -468,8 +441,11 @@ meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF
 #=========
 # Step 12a: all SNPs to run mCSM
 #=========
-snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) 
-pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) 
+snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) #336
+pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) #131
+
+# since 186 is not part of struc: it is one less
+#FIXME!

 # assign meaningful colnames 
 #snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
@ -480,24 +456,27 @@ snps_only.isna().sum() # should be 0
 # specify variable name for output file
 gene = 'pnca'
 #drug = 'pyrazinamide'
-my_fname1 = '_snps_'
+my_fname1 = '_snps'
 nrows = len(snps_only) 

 #output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
 #output_file_path = work_dir + '/Data/'
-output_file_path = data_dir + '/input/processed/' + drug + '/'
+#output_file_path = data_dir + '/input/processed/' + drug + '/'
+print(outdir)

-if not os.path.exists(output_file_path):
-    print( output_file_path, 'does not exist. Creating')
-    os.makedirs(output_file_path)
+if not os.path.exists(outdir):
+    print( outdir, 'does not exist. Creating')
+    os.makedirs(outdir)
    exit()

-output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
-print(output_filename) #<<<- check
+#out_filename = gene + my_fname1 + str(nrows) + '.csv'
+out_filename = '/' + gene + my_fname1 + '.csv'
+outfile = outdir + out_filename
+print(outfile) #<<<- check

 # write to csv: without column or row names
 # Bad practice: numbers at the start of a filename
-snps_only.to_csv(output_filename, header = False, index = False)
+snps_only.to_csv(out_filename, header = False, index = False)

 #=========
 # Step 12b: all snps with annotation
@ -519,12 +498,23 @@ snps_only.to_csv(output_filename, header = False, index = False)
 #my_fname2 = '_snps_with_metadata_'
 #nrows = len(pnca_snps_ALL) 

+#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
 #output_file_path = work_dir + '/Data/'
-#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
-#print(output_filename)  #<<<- check
+#output_file_path = data_dir + '/input/processed/' + drug + '/'
+print(outdir)
+
+if not os.path.exists(outdir):
+    print( outdir, 'does not exist. Creating')
+    os.makedirs(outdir)
+    exit()
+
+#out_filename = gene + my_fname2 + str(nrows) + '.csv'
+out_filename = '/' + gene + my_fname1 + '.csv'
+outfile = outdir + out_filename
+print(outfile) #<<<- check

 # write out file
-#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
+#pnca_snps_ALL.to_csv(outfile, header = True, index = False)

 #=========
 # Step 12c: comp snps for OR calcs with annotation
@ -547,7 +537,7 @@ meta_pnca_LF2['mutation'].nunique()
 meta_pnca_LF2.groupby('mutation_info').nunique() 

 # sanity check
-meta_pnca_LF2['id'].nunique() 
+meta_pnca_LF2['id'].nunique()  #1908

 # should be True
 if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
@ -569,15 +559,26 @@ len(pnca_snps_COMP)

 gene = 'pnca'
 #drug = 'pyrazinamide'
-my_fname3 = '_comp_snps_with_metadata_'
+my_fname3 = '_comp_snps_with_metadata'
 nrows = len(pnca_snps_COMP) 

-#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
-#print(output_filename) #<<<-check
+#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
+#output_file_path = work_dir + '/Data/'
+#output_file_path = data_dir + '/input/processed/' + drug + '/'
+print(outdir)
+
+if not os.path.exists(outdir):
+    print( outdir, 'does not exist. Creating')
+    os.makedirs(outdir)
+    exit()
+
+#out_filename = gene + my_fname3 + str(nrows) + '.csv'
+#out_filename = '/' + gene + my_fname3 + '.csv'
+#outfile = outdir + out_filename
+#print(outfile) #<<<- check

 # write out file
-#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)
-
+#pnca_snps_COMP.to_csv(outfile, header = True, index = False)

 #=========
 # Step 12d: comp snps only
@ -589,15 +590,26 @@ snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())

 gene = 'pnca'
 #drug = 'pyrazinamide'
-my_fname1 = '_comp_snps_'
+my_fname1 = '_comp_snps'
 nrows = len(snps_only) 

-output_filename = output_file_path + gene + my_fname1  + str(nrows) + '.csv'
-print(output_filename) #<<<- check
+#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
+#output_file_path = work_dir + '/Data/'
+#output_file_path = data_dir + '/input/processed/' + drug + '/'
+print(outdir)
+
+if not os.path.exists(outdir):
+    print( outdir, 'does not exist. Creating')
+    os.makedirs(outdir)
+    exit()
+
+#out_filename = gene + my_fname1 + str(nrows) + '.csv'
+out_filename = '/' + gene + my_fname1 + '.csv'
+outfile = outdir + out_filename
+print(outfile) #<<<- check

 # write to csv: without column or row names
-snps_only.to_csv(output_filename, header = False, index = False)
-
+snps_only.to_csv(outfile, header = False, index = False)

 #=#=#=#=#=#=#=#
 # COMMENT: LF1 is the file to extract all unique snps for mcsm 
@ -619,8 +631,21 @@ gene = 'pnca'
 #drug = 'pyrazinamide'
 my_fname4 = '_metadata'
 #nrows = len(meta_pnca_LF1)
-output_filename = output_file_path + gene + my_fname4  + '.csv'
-print(output_filename) #<<<-check
+
+#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
+#output_file_path = work_dir + '/Data/'
+#output_file_path = data_dir + '/input/processed/' + drug + '/'
+print(outdir)
+
+if not os.path.exists(outdir):
+    print( outdir, 'does not exist. Creating')
+    os.makedirs(outdir)
+    exit()
+
+#out_filename = gene + my_fname1 + str(nrows) + '.csv'
+out_filename = '/' + gene + my_fname4 + '.csv'
+outfile = outdir + out_filename
+print(outfile) #<<<- check

 # write out file
-meta_pnca_LF1.to_csv(output_filename) 
+meta_pnca_LF1.to_csv(outfile) 
--- a/meta_data_analysis/reference_dict.py
+++ b/meta_data_analysis/reference_dict.py
@ -6,7 +6,7 @@ Created on Tue Jun 18 11:32:28 2019
@author: tanushree
 """
 ############################################
-#load libraries
+# load libraries
 import pandas as pd
 import os
 #############################################
@ -17,14 +17,29 @@ import os
 # containing GWAS data
 #!#########################!

-print(os.getcwd()) 
-homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
-os.chdir(homedir + '/git/Data/input/original') 
+#print(os.getcwd()) 
+#homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+#os.chdir(homedir + '/git/Data/pyrazinamide/input/original') 
 print(os.getcwd())
+
+#%%
+############# specify variables for input and output paths and filenames
+drug = 'pyrazinamide'
+#gene = 'pnca'
+
+datadir = homedir + '/git/Data'
+basedir =  datadir + '/' + drug + '/input'
+
+# input
+inpath = "/original"
+in_filename  = "/aa_codes.csv"
+infile = basedir + inpath + in_filename
+print(infile)
+
 #==========
 #read file
 #==========
-my_aa = pd.read_csv('aa_codes.csv') #20, 6
+my_aa = pd.read_csv(infile) #20, 6
 #assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
 #my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6  #a way to it since it is the first column
 my_aa = my_aa.set_index('three_letter_code_lower') #20, 5