saving previous work from home pc
This commit is contained in:
parent
cae9c550a4
commit
be213cb7e9
3 changed files with 146 additions and 106 deletions
Binary file not shown.
|
@ -23,11 +23,7 @@ import pandas as pd
|
||||||
#my_dir = os.path.expanduser('~/some_dir')
|
#my_dir = os.path.expanduser('~/some_dir')
|
||||||
#make sure mcsm_analysis/ exists
|
#make sure mcsm_analysis/ exists
|
||||||
#or specify the output directory
|
#or specify the output directory
|
||||||
|
#====================================================
|
||||||
#%%
|
|
||||||
#%%
|
|
||||||
#%%
|
|
||||||
#========================================================
|
|
||||||
# TASK: extract ALL pncA mutations from GWAS data
|
# TASK: extract ALL pncA mutations from GWAS data
|
||||||
#========================================================
|
#========================================================
|
||||||
#%%
|
#%%
|
||||||
|
@ -40,83 +36,45 @@ os.getcwd()
|
||||||
#%%
|
#%%
|
||||||
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
|
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
|
||||||
#%%
|
#%%
|
||||||
#NOTE: Out_dir MUST exis
|
############# specify variables for input and output paths and filenames
|
||||||
# User defined dir strpyrazinamide
|
drug = "pyrazinamide"
|
||||||
drug = 'pyrazinamide'
|
gene = "pnca"
|
||||||
gene = 'pnca'
|
|
||||||
out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
|
|
||||||
# = out_dir + drug
|
|
||||||
data_dir = homedir + '/git/Data'
|
|
||||||
|
|
||||||
if not os.path.exists(data_dir):
|
datadir = homedir + "/git/Data"
|
||||||
print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
|
basedir = datadir + "/" + drug + "/input"
|
||||||
os.makedirs(data_dir)
|
|
||||||
|
# input
|
||||||
|
inpath = "/original"
|
||||||
|
in_filename = "/original_tanushree_data_v2.csv"
|
||||||
|
infile = basedir + inpath + in_filename
|
||||||
|
#print(infile)
|
||||||
|
|
||||||
|
# output: several output files
|
||||||
|
# output filenames in respective sections at the time of outputting files
|
||||||
|
outpath = "/processed"
|
||||||
|
outdir = basedir + outpath
|
||||||
|
#print(outdir)
|
||||||
|
|
||||||
|
if not os.path.exists(datadir):
|
||||||
|
print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
|
||||||
|
os.makedirs(datadir)
|
||||||
die()
|
die()
|
||||||
|
|
||||||
if not os.path.exists(out_dir):
|
if not os.path.exists(outdir):
|
||||||
print('Error!', out_dir, 'does not exist. Please create it')
|
print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
#if not os.path.exists(work_dir):
|
|
||||||
# print('creating dir that does not exist', 'dir_name:', work_dir)
|
|
||||||
# os.makedirs(work_dir)
|
|
||||||
else:
|
else:
|
||||||
print('Dir exists: Carrying on')
|
print('Dir exists: Carrying on')
|
||||||
|
|
||||||
# now create sub dir structure within work_dir
|
################## end of variable assignment for input and output files
|
||||||
# pyrazinamide/mcsm_analysis
|
|
||||||
|
|
||||||
# we need three dir
|
|
||||||
# Data
|
|
||||||
# Scripts
|
|
||||||
# Plotting
|
|
||||||
# Results
|
|
||||||
# Plots
|
|
||||||
|
|
||||||
# create a list of dir names
|
|
||||||
#dir_names = ['Data', 'Scripts', 'Results']
|
|
||||||
|
|
||||||
|
|
||||||
#for i in dir_names:
|
|
||||||
# this_dir = (work_dir + '/' + i)
|
|
||||||
# if not os.path.exists(this_dir):
|
|
||||||
# print('creating dir that does not exist:', this_dir)
|
|
||||||
# os.makedirs(this_dir)
|
|
||||||
#else:
|
|
||||||
# print('Dir exists: Carrying on')
|
|
||||||
|
|
||||||
# Create sub dirs
|
|
||||||
# 1)
|
|
||||||
# Scripts
|
|
||||||
# Plotting
|
|
||||||
#subdir_plotting = work_dir + '/Scripts/Plotting'
|
|
||||||
#if not os.path.exists(subdir_plotting):
|
|
||||||
# print('creating dir that does not exist:', subdir_plotting)
|
|
||||||
# os.makedirs(subdir_plotting)
|
|
||||||
#else:
|
|
||||||
# print('Dir exists: Carrying on')
|
|
||||||
|
|
||||||
# 2)
|
|
||||||
# Results
|
|
||||||
# Plots
|
|
||||||
#subdir_plots = work_dir + '/Results/Plots'
|
|
||||||
#if not os.path.exists(subdir_plots):
|
|
||||||
# print('creating dir that does not exist:', subdir_plots)
|
|
||||||
# os.makedirs(subdir_plots)
|
|
||||||
#else:
|
|
||||||
# print('Dir exists: Carrying on')
|
|
||||||
|
|
||||||
# clear varaibles
|
|
||||||
#del(dir_names, drug, i, subdir_plots, subdir_plotting)
|
|
||||||
|
|
||||||
#exit()
|
|
||||||
#%%
|
#%%
|
||||||
#==============================================================================
|
#==============================================================================
|
||||||
############
|
############
|
||||||
# STEP 1: Read file original_tanushree_data_v2.csv
|
# STEP 1: Read file original_tanushree_data_v2.csv
|
||||||
############
|
############
|
||||||
data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
|
#data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
|
||||||
meta_data = pd.read_csv(data_file, sep = ',')
|
meta_data = pd.read_csv(infile, sep = ',')
|
||||||
|
|
||||||
# column names
|
# column names
|
||||||
list(meta_data.columns)
|
list(meta_data.columns)
|
||||||
|
@ -130,7 +88,7 @@ meta_data = meta_data[['id'
|
||||||
, 'pyrazinamide'
|
, 'pyrazinamide'
|
||||||
, 'dr_mutations_pyrazinamide'
|
, 'dr_mutations_pyrazinamide'
|
||||||
, 'other_mutations_pyrazinamide'
|
, 'other_mutations_pyrazinamide'
|
||||||
]]
|
]] #19265, 67
|
||||||
|
|
||||||
# checks
|
# checks
|
||||||
total_samples = meta_data['id'].nunique() # 19265
|
total_samples = meta_data['id'].nunique() # 19265
|
||||||
|
@ -143,7 +101,12 @@ meta_data.head()
|
||||||
|
|
||||||
# equivalent of table in R
|
# equivalent of table in R
|
||||||
# pyrazinamide counts
|
# pyrazinamide counts
|
||||||
meta_data.pyrazinamide.value_counts()
|
meta_data.pyrazinamide.value_counts() #12511
|
||||||
|
#0.0 10565
|
||||||
|
#1.0 1946 {RESULT: No. of Resistant and Susceptible samples}
|
||||||
|
|
||||||
|
clear variables
|
||||||
|
#del(basedir, datadir, inpath, infile)
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
############
|
############
|
||||||
|
@ -155,7 +118,16 @@ meta_data.pyrazinamide.value_counts()
|
||||||
# and drop the entries with NA
|
# and drop the entries with NA
|
||||||
#############
|
#############
|
||||||
meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
||||||
|
#2163 {RESULT: samples with dr_muts}
|
||||||
|
dr_id = meta_pza['id'].unique()
|
||||||
|
|
||||||
meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
||||||
|
#526 (RESULT: samples with other_muts)
|
||||||
|
other_id = meta_pza['id'].unique()
|
||||||
|
|
||||||
|
# FIXME: See if the sample ids are unique in each
|
||||||
|
# find any common IDs
|
||||||
|
dr_id.isin(other_id[1,1])
|
||||||
|
|
||||||
del(meta_pza)
|
del(meta_pza)
|
||||||
|
|
||||||
|
@ -188,8 +160,9 @@ del(meta_pnca_other)
|
||||||
|
|
||||||
# Now extract "all" mutations
|
# Now extract "all" mutations
|
||||||
meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
|
meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
|
||||||
|
#2665, 8
|
||||||
|
|
||||||
meta_pnca_all['id'].nunique()
|
meta_pnca_all['id'].nunique() {#RESULT: pnca mutations in ALL samples}
|
||||||
pnca_samples = len(meta_pnca_all)
|
pnca_samples = len(meta_pnca_all)
|
||||||
pnca_na = meta_pnca_all['pyrazinamide'].isna().sum()
|
pnca_na = meta_pnca_all['pyrazinamide'].isna().sum()
|
||||||
comp_pnca_samples = pnca_samples - pnca_na
|
comp_pnca_samples = pnca_samples - pnca_na
|
||||||
|
@ -468,8 +441,11 @@ meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF
|
||||||
#=========
|
#=========
|
||||||
# Step 12a: all SNPs to run mCSM
|
# Step 12a: all SNPs to run mCSM
|
||||||
#=========
|
#=========
|
||||||
snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique())
|
snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) #336
|
||||||
pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique())
|
pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) #131
|
||||||
|
|
||||||
|
# since 186 is not part of struc: it is one less
|
||||||
|
#FIXME!
|
||||||
|
|
||||||
# assign meaningful colnames
|
# assign meaningful colnames
|
||||||
#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
|
#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
|
||||||
|
@ -480,24 +456,27 @@ snps_only.isna().sum() # should be 0
|
||||||
# specify variable name for output file
|
# specify variable name for output file
|
||||||
gene = 'pnca'
|
gene = 'pnca'
|
||||||
#drug = 'pyrazinamide'
|
#drug = 'pyrazinamide'
|
||||||
my_fname1 = '_snps_'
|
my_fname1 = '_snps'
|
||||||
nrows = len(snps_only)
|
nrows = len(snps_only)
|
||||||
|
|
||||||
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
||||||
#output_file_path = work_dir + '/Data/'
|
#output_file_path = work_dir + '/Data/'
|
||||||
output_file_path = data_dir + '/input/processed/' + drug + '/'
|
#output_file_path = data_dir + '/input/processed/' + drug + '/'
|
||||||
|
print(outdir)
|
||||||
|
|
||||||
if not os.path.exists(output_file_path):
|
if not os.path.exists(outdir):
|
||||||
print( output_file_path, 'does not exist. Creating')
|
print( outdir, 'does not exist. Creating')
|
||||||
os.makedirs(output_file_path)
|
os.makedirs(outdir)
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
|
#out_filename = gene + my_fname1 + str(nrows) + '.csv'
|
||||||
print(output_filename) #<<<- check
|
out_filename = '/' + gene + my_fname1 + '.csv'
|
||||||
|
outfile = outdir + out_filename
|
||||||
|
print(outfile) #<<<- check
|
||||||
|
|
||||||
# write to csv: without column or row names
|
# write to csv: without column or row names
|
||||||
# Bad practice: numbers at the start of a filename
|
# Bad practice: numbers at the start of a filename
|
||||||
snps_only.to_csv(output_filename, header = False, index = False)
|
snps_only.to_csv(out_filename, header = False, index = False)
|
||||||
|
|
||||||
#=========
|
#=========
|
||||||
# Step 12b: all snps with annotation
|
# Step 12b: all snps with annotation
|
||||||
|
@ -519,12 +498,23 @@ snps_only.to_csv(output_filename, header = False, index = False)
|
||||||
#my_fname2 = '_snps_with_metadata_'
|
#my_fname2 = '_snps_with_metadata_'
|
||||||
#nrows = len(pnca_snps_ALL)
|
#nrows = len(pnca_snps_ALL)
|
||||||
|
|
||||||
|
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
||||||
#output_file_path = work_dir + '/Data/'
|
#output_file_path = work_dir + '/Data/'
|
||||||
#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
|
#output_file_path = data_dir + '/input/processed/' + drug + '/'
|
||||||
#print(output_filename) #<<<- check
|
print(outdir)
|
||||||
|
|
||||||
|
if not os.path.exists(outdir):
|
||||||
|
print( outdir, 'does not exist. Creating')
|
||||||
|
os.makedirs(outdir)
|
||||||
|
exit()
|
||||||
|
|
||||||
|
#out_filename = gene + my_fname2 + str(nrows) + '.csv'
|
||||||
|
out_filename = '/' + gene + my_fname1 + '.csv'
|
||||||
|
outfile = outdir + out_filename
|
||||||
|
print(outfile) #<<<- check
|
||||||
|
|
||||||
# write out file
|
# write out file
|
||||||
#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
|
#pnca_snps_ALL.to_csv(outfile, header = True, index = False)
|
||||||
|
|
||||||
#=========
|
#=========
|
||||||
# Step 12c: comp snps for OR calcs with annotation
|
# Step 12c: comp snps for OR calcs with annotation
|
||||||
|
@ -547,7 +537,7 @@ meta_pnca_LF2['mutation'].nunique()
|
||||||
meta_pnca_LF2.groupby('mutation_info').nunique()
|
meta_pnca_LF2.groupby('mutation_info').nunique()
|
||||||
|
|
||||||
# sanity check
|
# sanity check
|
||||||
meta_pnca_LF2['id'].nunique()
|
meta_pnca_LF2['id'].nunique() #1908
|
||||||
|
|
||||||
# should be True
|
# should be True
|
||||||
if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
|
if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
|
||||||
|
@ -569,15 +559,26 @@ len(pnca_snps_COMP)
|
||||||
|
|
||||||
gene = 'pnca'
|
gene = 'pnca'
|
||||||
#drug = 'pyrazinamide'
|
#drug = 'pyrazinamide'
|
||||||
my_fname3 = '_comp_snps_with_metadata_'
|
my_fname3 = '_comp_snps_with_metadata'
|
||||||
nrows = len(pnca_snps_COMP)
|
nrows = len(pnca_snps_COMP)
|
||||||
|
|
||||||
#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
|
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
||||||
#print(output_filename) #<<<-check
|
#output_file_path = work_dir + '/Data/'
|
||||||
|
#output_file_path = data_dir + '/input/processed/' + drug + '/'
|
||||||
|
print(outdir)
|
||||||
|
|
||||||
|
if not os.path.exists(outdir):
|
||||||
|
print( outdir, 'does not exist. Creating')
|
||||||
|
os.makedirs(outdir)
|
||||||
|
exit()
|
||||||
|
|
||||||
|
#out_filename = gene + my_fname3 + str(nrows) + '.csv'
|
||||||
|
#out_filename = '/' + gene + my_fname3 + '.csv'
|
||||||
|
#outfile = outdir + out_filename
|
||||||
|
#print(outfile) #<<<- check
|
||||||
|
|
||||||
# write out file
|
# write out file
|
||||||
#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)
|
#pnca_snps_COMP.to_csv(outfile, header = True, index = False)
|
||||||
|
|
||||||
|
|
||||||
#=========
|
#=========
|
||||||
# Step 12d: comp snps only
|
# Step 12d: comp snps only
|
||||||
|
@ -589,15 +590,26 @@ snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
|
||||||
|
|
||||||
gene = 'pnca'
|
gene = 'pnca'
|
||||||
#drug = 'pyrazinamide'
|
#drug = 'pyrazinamide'
|
||||||
my_fname1 = '_comp_snps_'
|
my_fname1 = '_comp_snps'
|
||||||
nrows = len(snps_only)
|
nrows = len(snps_only)
|
||||||
|
|
||||||
output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
|
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
||||||
print(output_filename) #<<<- check
|
#output_file_path = work_dir + '/Data/'
|
||||||
|
#output_file_path = data_dir + '/input/processed/' + drug + '/'
|
||||||
|
print(outdir)
|
||||||
|
|
||||||
|
if not os.path.exists(outdir):
|
||||||
|
print( outdir, 'does not exist. Creating')
|
||||||
|
os.makedirs(outdir)
|
||||||
|
exit()
|
||||||
|
|
||||||
|
#out_filename = gene + my_fname1 + str(nrows) + '.csv'
|
||||||
|
out_filename = '/' + gene + my_fname1 + '.csv'
|
||||||
|
outfile = outdir + out_filename
|
||||||
|
print(outfile) #<<<- check
|
||||||
|
|
||||||
# write to csv: without column or row names
|
# write to csv: without column or row names
|
||||||
snps_only.to_csv(output_filename, header = False, index = False)
|
snps_only.to_csv(outfile, header = False, index = False)
|
||||||
|
|
||||||
|
|
||||||
#=#=#=#=#=#=#=#
|
#=#=#=#=#=#=#=#
|
||||||
# COMMENT: LF1 is the file to extract all unique snps for mcsm
|
# COMMENT: LF1 is the file to extract all unique snps for mcsm
|
||||||
|
@ -619,8 +631,21 @@ gene = 'pnca'
|
||||||
#drug = 'pyrazinamide'
|
#drug = 'pyrazinamide'
|
||||||
my_fname4 = '_metadata'
|
my_fname4 = '_metadata'
|
||||||
#nrows = len(meta_pnca_LF1)
|
#nrows = len(meta_pnca_LF1)
|
||||||
output_filename = output_file_path + gene + my_fname4 + '.csv'
|
|
||||||
print(output_filename) #<<<-check
|
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
||||||
|
#output_file_path = work_dir + '/Data/'
|
||||||
|
#output_file_path = data_dir + '/input/processed/' + drug + '/'
|
||||||
|
print(outdir)
|
||||||
|
|
||||||
|
if not os.path.exists(outdir):
|
||||||
|
print( outdir, 'does not exist. Creating')
|
||||||
|
os.makedirs(outdir)
|
||||||
|
exit()
|
||||||
|
|
||||||
|
#out_filename = gene + my_fname1 + str(nrows) + '.csv'
|
||||||
|
out_filename = '/' + gene + my_fname4 + '.csv'
|
||||||
|
outfile = outdir + out_filename
|
||||||
|
print(outfile) #<<<- check
|
||||||
|
|
||||||
# write out file
|
# write out file
|
||||||
meta_pnca_LF1.to_csv(output_filename)
|
meta_pnca_LF1.to_csv(outfile)
|
||||||
|
|
|
@ -6,7 +6,7 @@ Created on Tue Jun 18 11:32:28 2019
|
||||||
@author: tanushree
|
@author: tanushree
|
||||||
"""
|
"""
|
||||||
############################################
|
############################################
|
||||||
#load libraries
|
# load libraries
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import os
|
import os
|
||||||
#############################################
|
#############################################
|
||||||
|
@ -17,14 +17,29 @@ import os
|
||||||
# containing GWAS data
|
# containing GWAS data
|
||||||
#!#########################!
|
#!#########################!
|
||||||
|
|
||||||
|
#print(os.getcwd())
|
||||||
|
#homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
|
||||||
|
#os.chdir(homedir + '/git/Data/pyrazinamide/input/original')
|
||||||
print(os.getcwd())
|
print(os.getcwd())
|
||||||
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
|
|
||||||
os.chdir(homedir + '/git/Data/input/original')
|
#%%
|
||||||
print(os.getcwd())
|
############# specify variables for input and output paths and filenames
|
||||||
|
drug = 'pyrazinamide'
|
||||||
|
#gene = 'pnca'
|
||||||
|
|
||||||
|
datadir = homedir + '/git/Data'
|
||||||
|
basedir = datadir + '/' + drug + '/input'
|
||||||
|
|
||||||
|
# input
|
||||||
|
inpath = "/original"
|
||||||
|
in_filename = "/aa_codes.csv"
|
||||||
|
infile = basedir + inpath + in_filename
|
||||||
|
print(infile)
|
||||||
|
|
||||||
#==========
|
#==========
|
||||||
#read file
|
#read file
|
||||||
#==========
|
#==========
|
||||||
my_aa = pd.read_csv('aa_codes.csv') #20, 6
|
my_aa = pd.read_csv(infile) #20, 6
|
||||||
#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
|
#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
|
||||||
#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #a way to it since it is the first column
|
#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #a way to it since it is the first column
|
||||||
my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
|
my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue