saving previous work from home pc

This commit is contained in:
Tanushree Tunstall 2020-01-23 09:29:20 +00:00
parent 87060c036f
commit 772fd63d9f
3 changed files with 146 additions and 106 deletions

View file

@ -23,11 +23,7 @@ import pandas as pd
#my_dir = os.path.expanduser('~/some_dir')
#make sure mcsm_analysis/ exists
#or specify the output directory
#%%
#%%
#%%
#========================================================
#====================================================
# TASK: extract ALL pncA mutations from GWAS data
#========================================================
#%%
@ -40,83 +36,45 @@ os.getcwd()
#%%
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
#%%
#NOTE: out_dir MUST exist
# User defined dir strpyrazinamide
drug = 'pyrazinamide'
gene = 'pnca'
out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
# = out_dir + drug
data_dir = homedir + '/git/Data'
############# specify variables for input and output paths and filenames
drug = "pyrazinamide"
gene = "pnca"
if not os.path.exists(data_dir):
print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
os.makedirs(data_dir)
datadir = homedir + "/git/Data"
basedir = datadir + "/" + drug + "/input"
# input
inpath = "/original"
in_filename = "/original_tanushree_data_v2.csv"
infile = basedir + inpath + in_filename
#print(infile)
# output: several output files
# output filenames in respective sections at the time of outputting files
outpath = "/processed"
outdir = basedir + outpath
#print(outdir)
if not os.path.exists(datadir):
print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
os.makedirs(datadir)
exit() # 'die()' is not defined in Python (NameError); use exit() like the other directory checks in this script
if not os.path.exists(out_dir):
print('Error!', out_dir, 'does not exist. Please create it')
if not os.path.exists(outdir):
print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
exit()
#if not os.path.exists(work_dir):
# print('creating dir that does not exist', 'dir_name:', work_dir)
# os.makedirs(work_dir)
else:
print('Dir exists: Carrying on')
# now create sub dir structure within work_dir
# pyrazinamide/mcsm_analysis
# we need three dir
# Data
# Scripts
# Plotting
# Results
# Plots
# create a list of dir names
#dir_names = ['Data', 'Scripts', 'Results']
#for i in dir_names:
# this_dir = (work_dir + '/' + i)
# if not os.path.exists(this_dir):
# print('creating dir that does not exist:', this_dir)
# os.makedirs(this_dir)
#else:
# print('Dir exists: Carrying on')
# Create sub dirs
# 1)
# Scripts
# Plotting
#subdir_plotting = work_dir + '/Scripts/Plotting'
#if not os.path.exists(subdir_plotting):
# print('creating dir that does not exist:', subdir_plotting)
# os.makedirs(subdir_plotting)
#else:
# print('Dir exists: Carrying on')
# 2)
# Results
# Plots
#subdir_plots = work_dir + '/Results/Plots'
#if not os.path.exists(subdir_plots):
# print('creating dir that does not exist:', subdir_plots)
# os.makedirs(subdir_plots)
#else:
# print('Dir exists: Carrying on')
# clear variables
#del(dir_names, drug, i, subdir_plots, subdir_plotting)
#exit()
################## end of variable assignment for input and output files
#%%
#==============================================================================
############
# STEP 1: Read file original_tanushree_data_v2.csv
############
data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
meta_data = pd.read_csv(data_file, sep = ',')
#data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
meta_data = pd.read_csv(infile, sep = ',')
# column names
list(meta_data.columns)
@ -130,7 +88,7 @@ meta_data = meta_data[['id'
, 'pyrazinamide'
, 'dr_mutations_pyrazinamide'
, 'other_mutations_pyrazinamide'
]]
]] #19265, 67
# checks
total_samples = meta_data['id'].nunique() # 19265
@ -143,7 +101,12 @@ meta_data.head()
# equivalent of table in R
# pyrazinamide counts
meta_data.pyrazinamide.value_counts()
meta_data.pyrazinamide.value_counts() #12511
#0.0 10565
#1.0 1946 {RESULT: No. of Resistant and Susceptible samples}
# clear variables
#del(basedir, datadir, inpath, infile)
#%%
############
@ -155,7 +118,16 @@ meta_data.pyrazinamide.value_counts()
# and drop the entries with NA
#############
# Samples whose dr_mutations_pyrazinamide entry contains a pncA protein mutation
# (rows with NA in that column are treated as non-matching via na = False)
meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
#2163 {RESULT: samples with dr_muts}
dr_id = meta_pza['id'].unique()

# Samples whose other_mutations_pyrazinamide entry contains a pncA protein mutation
meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
#526 {RESULT: samples with other_muts}
other_id = meta_pza['id'].unique()

# FIXME: See if the sample ids are unique in each
# find any common IDs
# .unique() returns an array, which has no .isin() method, so wrap dr_id in a
# Series and keep the ids that also occur in other_id.
dr_id_series = pd.Series(dr_id)
common_ids = dr_id_series[dr_id_series.isin(other_id)]
print('No. of sample ids common to dr_muts and other_muts:', len(common_ids))

del(meta_pza)
@ -188,8 +160,9 @@ del(meta_pnca_other)
# Now extract "all" mutations
meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
#2665, 8
meta_pnca_all['id'].nunique()
meta_pnca_all['id'].nunique() # {RESULT: pnca mutations in ALL samples}
pnca_samples = len(meta_pnca_all)
pnca_na = meta_pnca_all['pyrazinamide'].isna().sum()
comp_pnca_samples = pnca_samples - pnca_na
@ -468,8 +441,11 @@ meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF
#=========
# Step 12a: all SNPs to run mCSM
#=========
snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique())
pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique())
snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) #336
pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) #131
# since 186 is not part of struc: it is one less
#FIXME!
# assign meaningful colnames
#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
@ -480,24 +456,27 @@ snps_only.isna().sum() # should be 0
# specify variable name for output file
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname1 = '_snps_'
my_fname1 = '_snps'
nrows = len(snps_only)
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
#output_file_path = work_dir + '/Data/'
output_file_path = data_dir + '/input/processed/' + drug + '/'
#output_file_path = data_dir + '/input/processed/' + drug + '/'
print(outdir)
if not os.path.exists(output_file_path):
print( output_file_path, 'does not exist. Creating')
os.makedirs(output_file_path)
if not os.path.exists(outdir):
print( outdir, 'does not exist. Creating')
os.makedirs(outdir)
exit()
output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
print(output_filename) #<<<- check
#out_filename = gene + my_fname1 + str(nrows) + '.csv'
out_filename = '/' + gene + my_fname1 + '.csv'
outfile = outdir + out_filename
print(outfile) #<<<- check
# write to csv: without column or row names
# Bad practice: numbers at the start of a filename
snps_only.to_csv(output_filename, header = False, index = False)
# write to the full path (outdir + out_filename); out_filename alone is '/<gene>_snps.csv', i.e. the filesystem root
snps_only.to_csv(outfile, header = False, index = False)
#=========
# Step 12b: all snps with annotation
@ -519,12 +498,23 @@ snps_only.to_csv(output_filename, header = False, index = False)
#my_fname2 = '_snps_with_metadata_'
#nrows = len(pnca_snps_ALL)
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
#output_file_path = work_dir + '/Data/'
#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
#print(output_filename) #<<<- check
#output_file_path = data_dir + '/input/processed/' + drug + '/'
print(outdir)
if not os.path.exists(outdir):
print( outdir, 'does not exist. Creating')
os.makedirs(outdir)
exit()
#out_filename = gene + my_fname2 + str(nrows) + '.csv'
out_filename = '/' + gene + my_fname1 + '.csv'
outfile = outdir + out_filename
print(outfile) #<<<- check
# write out file
#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
#pnca_snps_ALL.to_csv(outfile, header = True, index = False)
#=========
# Step 12c: comp snps for OR calcs with annotation
@ -547,7 +537,7 @@ meta_pnca_LF2['mutation'].nunique()
meta_pnca_LF2.groupby('mutation_info').nunique()
# sanity check
meta_pnca_LF2['id'].nunique()
meta_pnca_LF2['id'].nunique() #1908
# should be True
if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
@ -569,15 +559,26 @@ len(pnca_snps_COMP)
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname3 = '_comp_snps_with_metadata_'
my_fname3 = '_comp_snps_with_metadata'
nrows = len(pnca_snps_COMP)
#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
#print(output_filename) #<<<-check
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
#output_file_path = work_dir + '/Data/'
#output_file_path = data_dir + '/input/processed/' + drug + '/'
print(outdir)
if not os.path.exists(outdir):
print( outdir, 'does not exist. Creating')
os.makedirs(outdir)
exit()
#out_filename = gene + my_fname3 + str(nrows) + '.csv'
#out_filename = '/' + gene + my_fname3 + '.csv'
#outfile = outdir + out_filename
#print(outfile) #<<<- check
# write out file
#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)
#pnca_snps_COMP.to_csv(outfile, header = True, index = False)
#=========
# Step 12d: comp snps only
@ -589,15 +590,26 @@ snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname1 = '_comp_snps_'
my_fname1 = '_comp_snps'
nrows = len(snps_only)
output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
print(output_filename) #<<<- check
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
#output_file_path = work_dir + '/Data/'
#output_file_path = data_dir + '/input/processed/' + drug + '/'
print(outdir)
if not os.path.exists(outdir):
print( outdir, 'does not exist. Creating')
os.makedirs(outdir)
exit()
#out_filename = gene + my_fname1 + str(nrows) + '.csv'
out_filename = '/' + gene + my_fname1 + '.csv'
outfile = outdir + out_filename
print(outfile) #<<<- check
# write to csv: without column or row names
snps_only.to_csv(output_filename, header = False, index = False)
snps_only.to_csv(outfile, header = False, index = False)
#=#=#=#=#=#=#=#
# COMMENT: LF1 is the file to extract all unique snps for mcsm
@ -619,8 +631,21 @@ gene = 'pnca'
#drug = 'pyrazinamide'
my_fname4 = '_metadata'
#nrows = len(meta_pnca_LF1)
output_filename = output_file_path + gene + my_fname4 + '.csv'
print(output_filename) #<<<-check
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
#output_file_path = work_dir + '/Data/'
#output_file_path = data_dir + '/input/processed/' + drug + '/'
print(outdir)
if not os.path.exists(outdir):
print( outdir, 'does not exist. Creating')
os.makedirs(outdir)
exit()
#out_filename = gene + my_fname1 + str(nrows) + '.csv'
out_filename = '/' + gene + my_fname4 + '.csv'
outfile = outdir + out_filename
print(outfile) #<<<- check
# write out file
meta_pnca_LF1.to_csv(output_filename)
meta_pnca_LF1.to_csv(outfile)

View file

@ -6,7 +6,7 @@ Created on Tue Jun 18 11:32:28 2019
@author: tanushree
"""
############################################
#load libraries
# load libraries
import pandas as pd
import os
#############################################
@ -17,14 +17,29 @@ import os
# containing GWAS data
#!#########################!
#print(os.getcwd())
#homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
#os.chdir(homedir + '/git/Data/pyrazinamide/input/original')
print(os.getcwd())
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
os.chdir(homedir + '/git/Data/input/original')
print(os.getcwd())
#%%
############# specify variables for input and output paths and filenames
drug = 'pyrazinamide'
#gene = 'pnca'
datadir = homedir + '/git/Data'
basedir = datadir + '/' + drug + '/input'
# input
inpath = "/original"
in_filename = "/aa_codes.csv"
infile = basedir + inpath + in_filename
print(infile)
#==========
#read file
#==========
my_aa = pd.read_csv('aa_codes.csv') #20, 6
my_aa = pd.read_csv(infile) #20, 6
#assign the three_letter_code_lower column as the row names (index) so that it is easier to create a dict of dicts using the index
#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #a way to do it since it is the first column
my_aa = my_aa.set_index('three_letter_code_lower') #20, 5