saving previous work from home pc
This commit is contained in:
parent
c56a5e4497
commit
0ebd8a6d4b
3 changed files with 146 additions and 106 deletions
Binary file not shown.
|
@ -23,11 +23,7 @@ import pandas as pd
|
|||
#my_dir = os.path.expanduser('~/some_dir')
|
||||
#make sure mcsm_analysis/ exists
|
||||
#or specify the output directory
|
||||
|
||||
#%%
|
||||
#%%
|
||||
#%%
|
||||
#========================================================
|
||||
#====================================================
|
||||
# TASK: extract ALL pncA mutations from GWAS data
|
||||
#========================================================
|
||||
#%%
|
||||
|
@ -40,83 +36,45 @@ os.getcwd()
|
|||
#%%
|
||||
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
|
||||
#%%
|
||||
#NOTE: Out_dir MUST exis
|
||||
# User defined dir strpyrazinamide
|
||||
drug = 'pyrazinamide'
|
||||
gene = 'pnca'
|
||||
out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
|
||||
# = out_dir + drug
|
||||
data_dir = homedir + '/git/Data'
|
||||
############# specify variables for input and output paths and filenames
|
||||
drug = "pyrazinamide"
|
||||
gene = "pnca"
|
||||
|
||||
if not os.path.exists(data_dir):
|
||||
print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
|
||||
os.makedirs(data_dir)
|
||||
datadir = homedir + "/git/Data"
|
||||
basedir = datadir + "/" + drug + "/input"
|
||||
|
||||
# input
|
||||
inpath = "/original"
|
||||
in_filename = "/original_tanushree_data_v2.csv"
|
||||
infile = basedir + inpath + in_filename
|
||||
#print(infile)
|
||||
|
||||
# output: several output files
|
||||
# output filenames in respective sections at the time of outputting files
|
||||
outpath = "/processed"
|
||||
outdir = basedir + outpath
|
||||
#print(outdir)
|
||||
|
||||
if not os.path.exists(datadir):
|
||||
print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
|
||||
os.makedirs(datadir)
|
||||
die()
|
||||
|
||||
if not os.path.exists(out_dir):
|
||||
print('Error!', out_dir, 'does not exist. Please create it')
|
||||
if not os.path.exists(outdir):
|
||||
print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
|
||||
exit()
|
||||
|
||||
#if not os.path.exists(work_dir):
|
||||
# print('creating dir that does not exist', 'dir_name:', work_dir)
|
||||
# os.makedirs(work_dir)
|
||||
else:
|
||||
print('Dir exists: Carrying on')
|
||||
|
||||
# now create sub dir structure within work_dir
|
||||
# pyrazinamide/mcsm_analysis
|
||||
|
||||
# we need three dir
|
||||
# Data
|
||||
# Scripts
|
||||
# Plotting
|
||||
# Results
|
||||
# Plots
|
||||
|
||||
# create a list of dir names
|
||||
#dir_names = ['Data', 'Scripts', 'Results']
|
||||
|
||||
|
||||
#for i in dir_names:
|
||||
# this_dir = (work_dir + '/' + i)
|
||||
# if not os.path.exists(this_dir):
|
||||
# print('creating dir that does not exist:', this_dir)
|
||||
# os.makedirs(this_dir)
|
||||
#else:
|
||||
# print('Dir exists: Carrying on')
|
||||
|
||||
# Create sub dirs
|
||||
# 1)
|
||||
# Scripts
|
||||
# Plotting
|
||||
#subdir_plotting = work_dir + '/Scripts/Plotting'
|
||||
#if not os.path.exists(subdir_plotting):
|
||||
# print('creating dir that does not exist:', subdir_plotting)
|
||||
# os.makedirs(subdir_plotting)
|
||||
#else:
|
||||
# print('Dir exists: Carrying on')
|
||||
|
||||
# 2)
|
||||
# Results
|
||||
# Plots
|
||||
#subdir_plots = work_dir + '/Results/Plots'
|
||||
#if not os.path.exists(subdir_plots):
|
||||
# print('creating dir that does not exist:', subdir_plots)
|
||||
# os.makedirs(subdir_plots)
|
||||
#else:
|
||||
# print('Dir exists: Carrying on')
|
||||
|
||||
# clear varaibles
|
||||
#del(dir_names, drug, i, subdir_plots, subdir_plotting)
|
||||
|
||||
#exit()
|
||||
################## end of variable assignment for input and output files
|
||||
#%%
|
||||
#==============================================================================
|
||||
############
|
||||
# STEP 1: Read file original_tanushree_data_v2.csv
|
||||
############
|
||||
data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
|
||||
meta_data = pd.read_csv(data_file, sep = ',')
|
||||
#data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
|
||||
meta_data = pd.read_csv(infile, sep = ',')
|
||||
|
||||
# column names
|
||||
list(meta_data.columns)
|
||||
|
@ -130,7 +88,7 @@ meta_data = meta_data[['id'
|
|||
, 'pyrazinamide'
|
||||
, 'dr_mutations_pyrazinamide'
|
||||
, 'other_mutations_pyrazinamide'
|
||||
]]
|
||||
]] #19265, 67
|
||||
|
||||
# checks
|
||||
total_samples = meta_data['id'].nunique() # 19265
|
||||
|
@ -143,7 +101,12 @@ meta_data.head()
|
|||
|
||||
# equivalent of table in R
|
||||
# pyrazinamide counts
|
||||
meta_data.pyrazinamide.value_counts()
|
||||
meta_data.pyrazinamide.value_counts() #12511
|
||||
#0.0 10565
|
||||
#1.0 1946 {RESULT: No. of Resistant and Susceptible samples}
|
||||
|
||||
clear variables
|
||||
#del(basedir, datadir, inpath, infile)
|
||||
|
||||
#%%
|
||||
############
|
||||
|
@ -154,8 +117,17 @@ meta_data.pyrazinamide.value_counts()
|
|||
# (corresponding to a structure)
|
||||
# and drop the entries with NA
|
||||
#############
|
||||
meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
||||
meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
||||
meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
||||
#2163 {RESULT: samples with dr_muts}
|
||||
dr_id = meta_pza['id'].unique()
|
||||
|
||||
meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
||||
#526 (RESULT: samples with other_muts)
|
||||
other_id = meta_pza['id'].unique()
|
||||
|
||||
# FIXME: See if the sample ids are unique in each
|
||||
# find any common IDs
|
||||
dr_id.isin(other_id[1,1])
|
||||
|
||||
del(meta_pza)
|
||||
|
||||
|
@ -188,8 +160,9 @@ del(meta_pnca_other)
|
|||
|
||||
# Now extract "all" mutations
|
||||
meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
|
||||
#2665, 8
|
||||
|
||||
meta_pnca_all['id'].nunique()
|
||||
meta_pnca_all['id'].nunique() {#RESULT: pnca mutations in ALL samples}
|
||||
pnca_samples = len(meta_pnca_all)
|
||||
pnca_na = meta_pnca_all['pyrazinamide'].isna().sum()
|
||||
comp_pnca_samples = pnca_samples - pnca_na
|
||||
|
@ -468,8 +441,11 @@ meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF
|
|||
#=========
|
||||
# Step 12a: all SNPs to run mCSM
|
||||
#=========
|
||||
snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique())
|
||||
pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique())
|
||||
snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) #336
|
||||
pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) #131
|
||||
|
||||
# since 186 is not part of struc: it is one less
|
||||
#FIXME!
|
||||
|
||||
# assign meaningful colnames
|
||||
#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
|
||||
|
@ -480,24 +456,27 @@ snps_only.isna().sum() # should be 0
|
|||
# specify variable name for output file
|
||||
gene = 'pnca'
|
||||
#drug = 'pyrazinamide'
|
||||
my_fname1 = '_snps_'
|
||||
my_fname1 = '_snps'
|
||||
nrows = len(snps_only)
|
||||
|
||||
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
||||
#output_file_path = work_dir + '/Data/'
|
||||
output_file_path = data_dir + '/input/processed/' + drug + '/'
|
||||
#output_file_path = data_dir + '/input/processed/' + drug + '/'
|
||||
print(outdir)
|
||||
|
||||
if not os.path.exists(output_file_path):
|
||||
print( output_file_path, 'does not exist. Creating')
|
||||
os.makedirs(output_file_path)
|
||||
if not os.path.exists(outdir):
|
||||
print( outdir, 'does not exist. Creating')
|
||||
os.makedirs(outdir)
|
||||
exit()
|
||||
|
||||
output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
|
||||
print(output_filename) #<<<- check
|
||||
#out_filename = gene + my_fname1 + str(nrows) + '.csv'
|
||||
out_filename = '/' + gene + my_fname1 + '.csv'
|
||||
outfile = outdir + out_filename
|
||||
print(outfile) #<<<- check
|
||||
|
||||
# write to csv: without column or row names
|
||||
# Bad practice: numbers at the start of a filename
|
||||
snps_only.to_csv(output_filename, header = False, index = False)
|
||||
snps_only.to_csv(out_filename, header = False, index = False)
|
||||
|
||||
#=========
|
||||
# Step 12b: all snps with annotation
|
||||
|
@ -519,12 +498,23 @@ snps_only.to_csv(output_filename, header = False, index = False)
|
|||
#my_fname2 = '_snps_with_metadata_'
|
||||
#nrows = len(pnca_snps_ALL)
|
||||
|
||||
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
||||
#output_file_path = work_dir + '/Data/'
|
||||
#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
|
||||
#print(output_filename) #<<<- check
|
||||
#output_file_path = data_dir + '/input/processed/' + drug + '/'
|
||||
print(outdir)
|
||||
|
||||
if not os.path.exists(outdir):
|
||||
print( outdir, 'does not exist. Creating')
|
||||
os.makedirs(outdir)
|
||||
exit()
|
||||
|
||||
#out_filename = gene + my_fname2 + str(nrows) + '.csv'
|
||||
out_filename = '/' + gene + my_fname1 + '.csv'
|
||||
outfile = outdir + out_filename
|
||||
print(outfile) #<<<- check
|
||||
|
||||
# write out file
|
||||
#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
|
||||
#pnca_snps_ALL.to_csv(outfile, header = True, index = False)
|
||||
|
||||
#=========
|
||||
# Step 12c: comp snps for OR calcs with annotation
|
||||
|
@ -547,7 +537,7 @@ meta_pnca_LF2['mutation'].nunique()
|
|||
meta_pnca_LF2.groupby('mutation_info').nunique()
|
||||
|
||||
# sanity check
|
||||
meta_pnca_LF2['id'].nunique()
|
||||
meta_pnca_LF2['id'].nunique() #1908
|
||||
|
||||
# should be True
|
||||
if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
|
||||
|
@ -569,15 +559,26 @@ len(pnca_snps_COMP)
|
|||
|
||||
gene = 'pnca'
|
||||
#drug = 'pyrazinamide'
|
||||
my_fname3 = '_comp_snps_with_metadata_'
|
||||
my_fname3 = '_comp_snps_with_metadata'
|
||||
nrows = len(pnca_snps_COMP)
|
||||
|
||||
#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
|
||||
#print(output_filename) #<<<-check
|
||||
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
||||
#output_file_path = work_dir + '/Data/'
|
||||
#output_file_path = data_dir + '/input/processed/' + drug + '/'
|
||||
print(outdir)
|
||||
|
||||
if not os.path.exists(outdir):
|
||||
print( outdir, 'does not exist. Creating')
|
||||
os.makedirs(outdir)
|
||||
exit()
|
||||
|
||||
#out_filename = gene + my_fname3 + str(nrows) + '.csv'
|
||||
#out_filename = '/' + gene + my_fname3 + '.csv'
|
||||
#outfile = outdir + out_filename
|
||||
#print(outfile) #<<<- check
|
||||
|
||||
# write out file
|
||||
#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)
|
||||
|
||||
#pnca_snps_COMP.to_csv(outfile, header = True, index = False)
|
||||
|
||||
#=========
|
||||
# Step 12d: comp snps only
|
||||
|
@ -589,15 +590,26 @@ snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
|
|||
|
||||
gene = 'pnca'
|
||||
#drug = 'pyrazinamide'
|
||||
my_fname1 = '_comp_snps_'
|
||||
my_fname1 = '_comp_snps'
|
||||
nrows = len(snps_only)
|
||||
|
||||
output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
|
||||
print(output_filename) #<<<- check
|
||||
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
||||
#output_file_path = work_dir + '/Data/'
|
||||
#output_file_path = data_dir + '/input/processed/' + drug + '/'
|
||||
print(outdir)
|
||||
|
||||
if not os.path.exists(outdir):
|
||||
print( outdir, 'does not exist. Creating')
|
||||
os.makedirs(outdir)
|
||||
exit()
|
||||
|
||||
#out_filename = gene + my_fname1 + str(nrows) + '.csv'
|
||||
out_filename = '/' + gene + my_fname1 + '.csv'
|
||||
outfile = outdir + out_filename
|
||||
print(outfile) #<<<- check
|
||||
|
||||
# write to csv: without column or row names
|
||||
snps_only.to_csv(output_filename, header = False, index = False)
|
||||
|
||||
snps_only.to_csv(outfile, header = False, index = False)
|
||||
|
||||
#=#=#=#=#=#=#=#
|
||||
# COMMENT: LF1 is the file to extract all unique snps for mcsm
|
||||
|
@ -619,8 +631,21 @@ gene = 'pnca'
|
|||
#drug = 'pyrazinamide'
|
||||
my_fname4 = '_metadata'
|
||||
#nrows = len(meta_pnca_LF1)
|
||||
output_filename = output_file_path + gene + my_fname4 + '.csv'
|
||||
print(output_filename) #<<<-check
|
||||
|
||||
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
||||
#output_file_path = work_dir + '/Data/'
|
||||
#output_file_path = data_dir + '/input/processed/' + drug + '/'
|
||||
print(outdir)
|
||||
|
||||
if not os.path.exists(outdir):
|
||||
print( outdir, 'does not exist. Creating')
|
||||
os.makedirs(outdir)
|
||||
exit()
|
||||
|
||||
#out_filename = gene + my_fname1 + str(nrows) + '.csv'
|
||||
out_filename = '/' + gene + my_fname4 + '.csv'
|
||||
outfile = outdir + out_filename
|
||||
print(outfile) #<<<- check
|
||||
|
||||
# write out file
|
||||
meta_pnca_LF1.to_csv(output_filename)
|
||||
meta_pnca_LF1.to_csv(outfile)
|
||||
|
|
|
@ -6,7 +6,7 @@ Created on Tue Jun 18 11:32:28 2019
|
|||
@author: tanushree
|
||||
"""
|
||||
############################################
|
||||
#load libraries
|
||||
# load libraries
|
||||
import pandas as pd
|
||||
import os
|
||||
#############################################
|
||||
|
@ -17,14 +17,29 @@ import os
|
|||
# containing GWAS data
|
||||
#!#########################!
|
||||
|
||||
print(os.getcwd())
|
||||
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
|
||||
os.chdir(homedir + '/git/Data/input/original')
|
||||
#print(os.getcwd())
|
||||
#homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
|
||||
#os.chdir(homedir + '/git/Data/pyrazinamide/input/original')
|
||||
print(os.getcwd())
|
||||
|
||||
#%%
|
||||
############# specify variables for input and output paths and filenames
|
||||
drug = 'pyrazinamide'
|
||||
#gene = 'pnca'
|
||||
|
||||
datadir = homedir + '/git/Data'
|
||||
basedir = datadir + '/' + drug + '/input'
|
||||
|
||||
# input
|
||||
inpath = "/original"
|
||||
in_filename = "/aa_codes.csv"
|
||||
infile = basedir + inpath + in_filename
|
||||
print(infile)
|
||||
|
||||
#==========
|
||||
#read file
|
||||
#==========
|
||||
my_aa = pd.read_csv('aa_codes.csv') #20, 6
|
||||
my_aa = pd.read_csv(infile) #20, 6
|
||||
#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
|
||||
#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #a way to it since it is the first column
|
||||
my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue