651 lines
21 KiB
Python
Executable file
651 lines
21 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Tue Aug 6 12:56:03 2019
|
|
|
|
@author: tanu
|
|
"""
|
|
|
|
# FIXME: include error checking to enure you only
|
|
# concentrate on positions that have structural info?
|
|
|
|
#%% load libraries
|
|
###################
|
|
# load libraries
|
|
import os, sys
|
|
import pandas as pd
|
|
#import numpy as np
|
|
|
|
#from pandas.api.types import is_string_dtype
|
|
#from pandas.api.types import is_numeric_dtype
|
|
|
|
# to create dir
|
|
#my_dir = os.path.expanduser('~/some_dir')
|
|
#make sure mcsm_analysis/ exists
|
|
#or specify the output directory
|
|
#====================================================
|
|
# TASK: extract ALL pncA mutations from GWAS data
|
|
#========================================================
|
|
#%%
|
|
####################
|
|
# my working dir
|
|
os.getcwd()
|
|
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
|
|
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
|
os.getcwd()
|
|
#%%
|
|
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
|
|
#%%
|
|
############# specify variables for input and output paths and filenames
|
|
drug = "pyrazinamide"
|
|
gene = "pnca"
|
|
|
|
datadir = homedir + "/git/Data"
|
|
basedir = datadir + "/" + drug + "/input"
|
|
|
|
# input
|
|
inpath = "/original"
|
|
in_filename = "/original_tanushree_data_v2.csv"
|
|
infile = basedir + inpath + in_filename
|
|
#print(infile)
|
|
|
|
# output: several output files
|
|
# output filenames in respective sections at the time of outputting files
|
|
outpath = "/processed"
|
|
outdir = basedir + outpath
|
|
#print(outdir)
|
|
|
|
if not os.path.exists(datadir):
|
|
print('Error!', datadir, 'does not exist. Please ensure it exists. Dir struc specified in README.md')
|
|
os.makedirs(datadir)
|
|
die()
|
|
|
|
if not os.path.exists(outdir):
|
|
print('Error!', outdir, 'does not exist.Please ensure it exists. Dir struc specified in README.md')
|
|
exit()
|
|
|
|
else:
|
|
print('Dir exists: Carrying on')
|
|
|
|
################## end of variable assignment for input and output files
|
|
#%%
|
|
#==============================================================================
|
|
############
|
|
# STEP 1: Read file original_tanushree_data_v2.csv
|
|
############
|
|
#data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
|
|
meta_data = pd.read_csv(infile, sep = ',')
|
|
|
|
# column names
|
|
list(meta_data.columns)
|
|
|
|
# extract elevant columns to extract from meta data related to the pyrazinamide
|
|
meta_data = meta_data[['id'
|
|
,'country'
|
|
,'lineage'
|
|
,'sublineage'
|
|
,'drtype'
|
|
, 'pyrazinamide'
|
|
, 'dr_mutations_pyrazinamide'
|
|
, 'other_mutations_pyrazinamide'
|
|
]] #19265, 67
|
|
|
|
# checks
|
|
total_samples = meta_data['id'].nunique() # 19265
|
|
|
|
# counts NA per column
|
|
meta_data.isna().sum()
|
|
|
|
# glance
|
|
meta_data.head()
|
|
|
|
# equivalent of table in R
|
|
# pyrazinamide counts
|
|
meta_data.pyrazinamide.value_counts() #12511
|
|
#0.0 10565
|
|
#1.0 1946 {RESULT: No. of Resistant and Susceptible samples}
|
|
|
|
clear variables
|
|
#del(basedir, datadir, inpath, infile)
|
|
|
|
#%%
|
|
############
|
|
# STEP 2: extract entries containing selected genes:
|
|
# pyrazinamide: pnca_p.
|
|
# in the dr_mutations and other mutations"
|
|
# as we are interested in the mutations in the protein coding region only
|
|
# (corresponding to a structure)
|
|
# and drop the entries with NA
|
|
#############
|
|
meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
|
#2163 {RESULT: samples with dr_muts}
|
|
dr_id = meta_pza['id'].unique()
|
|
|
|
meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
|
#526 (RESULT: samples with other_muts)
|
|
other_id = meta_pza['id'].unique()
|
|
|
|
# FIXME: See if the sample ids are unique in each
|
|
# find any common IDs
|
|
dr_id.isin(other_id[1,1])
|
|
|
|
del(meta_pza)
|
|
|
|
##########################
|
|
# pyrazinamide: pnca_p.
|
|
##########################
|
|
meta_data_pnca = meta_data[['id'
|
|
,'country'
|
|
,'lineage'
|
|
,'sublineage'
|
|
,'drtype'
|
|
, 'pyrazinamide'
|
|
, 'dr_mutations_pyrazinamide'
|
|
, 'other_mutations_pyrazinamide'
|
|
]]
|
|
|
|
del(meta_data)
|
|
|
|
# sanity checks
|
|
|
|
# dr_mutations only
|
|
meta_pnca_dr = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
|
meta_pnca_dr['id'].nunique()
|
|
del(meta_pnca_dr)
|
|
|
|
# other mutations
|
|
meta_pnca_other = meta_data_pnca.loc[meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
|
meta_pnca_other['id'].nunique()
|
|
del(meta_pnca_other)
|
|
|
|
# Now extract "all" mutations
|
|
meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
|
|
#2665, 8
|
|
|
|
meta_pnca_all['id'].nunique() {#RESULT: pnca mutations in ALL samples}
|
|
pnca_samples = len(meta_pnca_all)
|
|
pnca_na = meta_pnca_all['pyrazinamide'].isna().sum()
|
|
comp_pnca_samples = pnca_samples - pnca_na
|
|
|
|
#=#=#=#=#=#=#
|
|
# COMMENT: use it later to check number of complete samples from LF data
|
|
#=#=#=#=#=#=#
|
|
|
|
# sanity checks
|
|
meta_pnca_all.dr_mutations_pyrazinamide.value_counts()
|
|
meta_pnca_all.other_mutations_pyrazinamide.value_counts()
|
|
|
|
# more sanity checks
|
|
# !CAUTION!: muts will change depending on your gene
|
|
|
|
# dr muts : insert
|
|
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro')] #
|
|
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')] # empty
|
|
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Val139Leu')]
|
|
|
|
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows
|
|
m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows
|
|
|
|
# other_muts
|
|
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro*')] # empty
|
|
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')]
|
|
|
|
#=#=#=#=#=#=#=#=#=#
|
|
# FIXME
|
|
# COMMENTS: both mutations columns are separated by ;
|
|
# CHECK if there are mutations that exist both in dr and other_muts!
|
|
# doesn't make any sense for the same mut to exist in both, I would have thought!
|
|
#=#=#=#=#=#=#=#=#=#
|
|
|
|
# remove not required variables
|
|
del(meta_data_pnca)
|
|
|
|
############
|
|
# STEP 3: split the columns of
|
|
# a) dr_mutation_... (;) as
|
|
# the column has snps related to multiple genes.
|
|
# useful links
|
|
# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows
|
|
# this one works beautifully
|
|
# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
|
|
############
|
|
|
|
# sanity check: counts NA per column afer subsetted df: i.e in meta_pza(with pncA_p. extracted mutations)
|
|
meta_pnca_all.isna().sum()
|
|
|
|
#=#=#=#=#=#=#=#=#=#
|
|
# COMMENT: no NA's in dr_mutations/other_mutations_columns
|
|
#=#=#=#=#=#=#=#=#=#
|
|
# define the split function
|
|
def tidy_split(df, column, sep='|', keep=False):
|
|
"""
|
|
Split the values of a column and expand so the new DataFrame has one split
|
|
value per row. Filters rows where the column is missing.
|
|
|
|
Params
|
|
------
|
|
df : pandas.DataFrame
|
|
dataframe with the column to split and expand
|
|
column : str
|
|
the column to split and expand
|
|
sep : str
|
|
the string used to split the column's values
|
|
keep : bool
|
|
whether to retain the presplit value as it's own row
|
|
|
|
Returns
|
|
-------
|
|
pandas.DataFrame
|
|
Returns a dataframe with the same columns as `df`.
|
|
"""
|
|
indexes = list()
|
|
new_values = list()
|
|
#df = df.dropna(subset=[column])#<<<<<<-----see this incase you need to uncomment based on use case
|
|
for i, presplit in enumerate(df[column].astype(str)):
|
|
values = presplit.split(sep)
|
|
if keep and len(values) > 1:
|
|
indexes.append(i)
|
|
new_values.append(presplit)
|
|
for value in values:
|
|
indexes.append(i)
|
|
new_values.append(value)
|
|
new_df = df.iloc[indexes, :].copy()
|
|
new_df[column] = new_values
|
|
return new_df
|
|
|
|
########
|
|
# 3a: call tidy_split() on 'dr_mutations_pyrazinamide' column and remove leading white spaces
|
|
#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
|
|
########
|
|
meta_pnca_WF0 = tidy_split(meta_pnca_all, 'dr_mutations_pyrazinamide', sep = ';')
|
|
|
|
# remove leading white space else these are counted as distinct mutations as well
|
|
meta_pnca_WF0['dr_mutations_pyrazinamide'] = meta_pnca_WF0['dr_mutations_pyrazinamide'].str.lstrip()
|
|
|
|
########
|
|
# 3b: call function on 'other_mutations_pyrazinamide' column and remove leading white spaces
|
|
########
|
|
meta_pnca_WF1 = tidy_split(meta_pnca_WF0, 'other_mutations_pyrazinamide', sep = ';')
|
|
|
|
# remove the leading white spaces in the column
|
|
meta_pnca_WF1['other_mutations_pyrazinamide'] = meta_pnca_WF1['other_mutations_pyrazinamide'].str.strip()
|
|
|
|
##########
|
|
# Step 4: Reshape data so that all mutations are in one column and the
|
|
# annotations for the mutation reflect the column name
|
|
# LINK: http://www.datasciencemadesimple.com/reshape-wide-long-pandas-python-melt-function/
|
|
|
|
# data frame “df” is passed to melt() function
|
|
# id_vars is the variable which need to be left unaltered
|
|
# var_name are the column names so we named it as 'mutation_info'
|
|
# value_name are its values so we named it as 'mutation'
|
|
##########
|
|
meta_pnca_WF1.columns
|
|
|
|
meta_pnca_LF0 = pd.melt(meta_pnca_WF1
|
|
, id_vars = ['id', 'country', 'lineage', 'sublineage', 'drtype', 'pyrazinamide']
|
|
, var_name = 'mutation_info'
|
|
, value_name = 'mutation')
|
|
|
|
# sanity check: should be true
|
|
if len(meta_pnca_LF0) == len(meta_pnca_WF1)*2:
|
|
print('sanity check passed: Long format df has the expected length')
|
|
else:
|
|
print("Sanity check failed: Debug please!")
|
|
|
|
###########
|
|
# Step 5: This is still dirty data. Filter LF data so that you only have
|
|
# mutations corresponding to pnca_p.
|
|
# this will be your list you run OR calcs
|
|
###########
|
|
meta_pnca_LF1 = meta_pnca_LF0[meta_pnca_LF0['mutation'].str.contains('pncA_p.*')]
|
|
|
|
# sanity checks
|
|
# unique samples
|
|
meta_pnca_LF1['id'].nunique()
|
|
if len(meta_pnca_all) == meta_pnca_LF1['id'].nunique():
|
|
print("Sanity check passed: No of samples with pncA mutations match")
|
|
else:
|
|
print("Sanity check failed: Debug please!")
|
|
|
|
# count if all the mutations are indeed in the protein coding region
|
|
# i.e begin with pnca_p
|
|
meta_pnca_LF1['mutation'].str.count('pncA_p.').sum() # 3093
|
|
|
|
# should be true.
|
|
# and check against the length of the df, which should match
|
|
if len(meta_pnca_LF1) == meta_pnca_LF1['mutation'].str.count('pncA_p.').sum():
|
|
print("Sanity check passed: Long format data containing pnca mutations indeed correspond to pncA_p region")
|
|
else:
|
|
print("Sanity check failed: Debug please!")
|
|
|
|
###########
|
|
# Step 6: Filter dataframe with "na" in the drug column
|
|
# This is because for OR, you can't use the snps that have the
|
|
# NA in the specified drug column
|
|
# it creates problems when performing calcs in R inside the loop
|
|
# so best to filter it here
|
|
###########
|
|
# NOT NEEDED FOR all snps, only for extracting valid OR snps
|
|
del (meta_pnca_WF0, meta_pnca_WF1, meta_pnca_LF0, meta_pnca_all)
|
|
|
|
###########
|
|
# Step 7: count unique pncA_p mutations (all and comp cases)
|
|
###########
|
|
meta_pnca_LF1['mutation'].nunique()
|
|
meta_pnca_LF1.groupby('mutation_info').nunique()
|
|
|
|
meta_pnca_LF1['id'].nunique()
|
|
meta_pnca_LF1['mutation'].nunique()
|
|
meta_pnca_LF1.groupby('id').nunique()
|
|
|
|
###########
|
|
# Step 8: convert all snps only (IN LOWERCASE)
|
|
# because my mcsm file intergated has lowercase
|
|
###########
|
|
# convert mutation to lower case as it needs to exactly match the dict key
|
|
#meta_pnca_LF1['mutation'] = meta_pnca_LF1.mutation.str.lower() # WARNINGS: suggested to use .loc
|
|
meta_pnca_LF1['mutation'] = meta_pnca_LF1.loc[:, 'mutation'].str.lower()
|
|
|
|
###########
|
|
# Step 9 : Split 'mutation' column into three: wild_type, position and
|
|
# mutant_type separately. Then map three letter code to one from the
|
|
# referece_dict imported pncaeady. First convert to mutation to lowercase
|
|
# to allow to match entries from dict
|
|
###########
|
|
#=======
|
|
# Step 9a: iterate through the dict, create a lookup dict i.e
|
|
# lookup_dict = {three_letter_code: one_letter_code}.
|
|
# lookup dict should be the key and the value (you want to create a column for)
|
|
# Then use this to perform the mapping separetly for wild type and mutant cols.
|
|
# The three letter code is extracted using a regex match from the dataframe and then converted
|
|
# to 'pandas series'since map only works in pandas series
|
|
#=======
|
|
# initialise a sub dict that is a lookup dict for three letter code to one
|
|
lookup_dict = dict()
|
|
for k, v in my_aa_dict.items():
|
|
lookup_dict[k] = v['one_letter_code']
|
|
wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
|
|
meta_pnca_LF1['wild_type'] = wt.map(lookup_dict)
|
|
mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
|
|
meta_pnca_LF1['mutant_type'] = mut.map(lookup_dict)
|
|
|
|
# extract position info from mutation column separetly using regex
|
|
meta_pnca_LF1['position'] = meta_pnca_LF1['mutation'].str.extract(r'(\d+)')
|
|
|
|
# clear variables
|
|
del(k, v, wt, mut, lookup_dict)
|
|
|
|
#=========
|
|
# Step 9b: iterate through the dict, create a lookup dict that i.e
|
|
# lookup_dict = {three_letter_code: aa_prop_water}
|
|
# Do this for both wild_type and mutant as above.
|
|
#=========
|
|
# initialise a sub dict that is lookup dict for three letter code to aa prop
|
|
lookup_dict = dict()
|
|
|
|
for k, v in my_aa_dict.items():
|
|
lookup_dict[k] = v['aa_prop_water']
|
|
#print(lookup_dict)
|
|
wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
|
|
meta_pnca_LF1['wt_prop_water'] = wt.map(lookup_dict)
|
|
mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
|
|
meta_pnca_LF1['mut_prop_water'] = mut.map(lookup_dict)
|
|
|
|
# added two more cols
|
|
|
|
# clear variables
|
|
del(k, v, wt, mut, lookup_dict)
|
|
|
|
#========
|
|
# Step 9c: iterate through the dict, create a lookup dict that i.e
|
|
# lookup_dict = {three_letter_code: aa_prop_polarity}
|
|
# Do this for both wild_type and mutant as above.
|
|
#=========
|
|
# initialise a sub dict that is lookup dict for three letter code to aa prop
|
|
lookup_dict = dict()
|
|
|
|
for k, v in my_aa_dict.items():
|
|
lookup_dict[k] = v['aa_prop_polarity']
|
|
#print(lookup_dict)
|
|
wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
|
|
meta_pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
|
|
mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
|
|
meta_pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
|
|
|
|
# added two more cols
|
|
|
|
# clear variables
|
|
del(k, v, wt, mut, lookup_dict)
|
|
|
|
########
|
|
# Step 10: combine the wild_type+poistion+mutant_type columns to generate
|
|
# Mutationinformation (matches mCSM output field)
|
|
# Remember to use .map(str) for int col types to allow string concatenation
|
|
#########
|
|
meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF1.position.map(str) + meta_pnca_LF1['mutant_type']
|
|
|
|
#=#=#=#=#=#=#
|
|
# Step 11:
|
|
# COMMENT: there is more processing in the older version of this script
|
|
# consult if necessary
|
|
# possibly due to the presence of true_wt
|
|
# since this file doesn't contain any true_wt, we won't need it(hopefully!)
|
|
#=#=#=#=#=#=#
|
|
|
|
#%%
|
|
###########
|
|
# Step 12: Output files for only SNPs to run mCSM
|
|
###########
|
|
|
|
#=========
|
|
# Step 12a: all SNPs to run mCSM
|
|
#=========
|
|
snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) #336
|
|
pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) #131
|
|
|
|
# since 186 is not part of struc: it is one less
|
|
#FIXME!
|
|
|
|
# assign meaningful colnames
|
|
#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
|
|
#list(snps_only.columns)
|
|
snps_only.isna().sum() # should be 0
|
|
|
|
# output csv: all SNPS for mCSM analysis
|
|
# specify variable name for output file
|
|
gene = 'pnca'
|
|
#drug = 'pyrazinamide'
|
|
my_fname1 = '_snps'
|
|
nrows = len(snps_only)
|
|
|
|
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
|
#output_file_path = work_dir + '/Data/'
|
|
#output_file_path = data_dir + '/input/processed/' + drug + '/'
|
|
print(outdir)
|
|
|
|
if not os.path.exists(outdir):
|
|
print( outdir, 'does not exist. Creating')
|
|
os.makedirs(outdir)
|
|
exit()
|
|
|
|
#out_filename = gene + my_fname1 + str(nrows) + '.csv'
|
|
out_filename = '/' + gene + my_fname1 + '.csv'
|
|
outfile = outdir + out_filename
|
|
print(outfile) #<<<- check
|
|
|
|
# write to csv: without column or row names
|
|
# Bad practice: numbers at the start of a filename
|
|
snps_only.to_csv(out_filename, header = False, index = False)
|
|
|
|
#=========
|
|
# Step 12b: all snps with annotation
|
|
#=========
|
|
# all snps, selected cols
|
|
#pnca_snps_ALL = meta_pnca_LF1[['id','country','lineage', 'sublineage'
|
|
# , 'drtype', 'pyrazinamide'
|
|
# , 'mutation_info', 'mutation', 'Mutationinformation']]
|
|
|
|
#len(pnca_snps_ALL)
|
|
|
|
# sanity check
|
|
#meta_pnca_LF1['mutation'].nunique()
|
|
|
|
# output csv: WITH column but WITHOUT row names(all snps with meta data)
|
|
# specify variable name for output file
|
|
#gene = 'pnca'
|
|
#drug = 'pyrazinamide'
|
|
#my_fname2 = '_snps_with_metadata_'
|
|
#nrows = len(pnca_snps_ALL)
|
|
|
|
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
|
#output_file_path = work_dir + '/Data/'
|
|
#output_file_path = data_dir + '/input/processed/' + drug + '/'
|
|
print(outdir)
|
|
|
|
if not os.path.exists(outdir):
|
|
print( outdir, 'does not exist. Creating')
|
|
os.makedirs(outdir)
|
|
exit()
|
|
|
|
#out_filename = gene + my_fname2 + str(nrows) + '.csv'
|
|
out_filename = '/' + gene + my_fname1 + '.csv'
|
|
outfile = outdir + out_filename
|
|
print(outfile) #<<<- check
|
|
|
|
# write out file
|
|
#pnca_snps_ALL.to_csv(outfile, header = True, index = False)
|
|
|
|
#=========
|
|
# Step 12c: comp snps for OR calcs with annotation
|
|
#=========
|
|
# remove all NA's from pyrazinamide column from LF1
|
|
|
|
# counts NA per column
|
|
meta_pnca_LF1.isna().sum()
|
|
|
|
# remove NA
|
|
meta_pnca_LF2 = meta_pnca_LF1.dropna(subset=['pyrazinamide'])
|
|
|
|
# sanity checks
|
|
# should be True
|
|
len(meta_pnca_LF2) == len(meta_pnca_LF1) - meta_pnca_LF1['pyrazinamide'].isna().sum()
|
|
|
|
# unique counts
|
|
meta_pnca_LF2['mutation'].nunique()
|
|
|
|
meta_pnca_LF2.groupby('mutation_info').nunique()
|
|
|
|
# sanity check
|
|
meta_pnca_LF2['id'].nunique() #1908
|
|
|
|
# should be True
|
|
if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
|
|
print ('sanity check passed: complete numbers match')
|
|
else:
|
|
print('Error: Please Debug!')
|
|
|
|
# value counts
|
|
meta_pnca_LF2.mutation.value_counts()
|
|
#meta_pnca_LF2.groupby(['mutation_info', 'mutation']).size()
|
|
|
|
# valid/comp snps
|
|
# uncomment as necessary
|
|
pnca_snps_COMP = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
|
|
len(pnca_snps_COMP)
|
|
|
|
# output csv: WITH column but WITHOUT row names (COMP snps with meta data)
|
|
# specify variable name for output file
|
|
|
|
gene = 'pnca'
|
|
#drug = 'pyrazinamide'
|
|
my_fname3 = '_comp_snps_with_metadata'
|
|
nrows = len(pnca_snps_COMP)
|
|
|
|
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
|
#output_file_path = work_dir + '/Data/'
|
|
#output_file_path = data_dir + '/input/processed/' + drug + '/'
|
|
print(outdir)
|
|
|
|
if not os.path.exists(outdir):
|
|
print( outdir, 'does not exist. Creating')
|
|
os.makedirs(outdir)
|
|
exit()
|
|
|
|
#out_filename = gene + my_fname3 + str(nrows) + '.csv'
|
|
#out_filename = '/' + gene + my_fname3 + '.csv'
|
|
#outfile = outdir + out_filename
|
|
#print(outfile) #<<<- check
|
|
|
|
# write out file
|
|
#pnca_snps_COMP.to_csv(outfile, header = True, index = False)
|
|
|
|
#=========
|
|
# Step 12d: comp snps only
|
|
#=========
|
|
# output csv: comp SNPS for info (i.e snps for which OR exist)
|
|
# specify variable name for output file
|
|
|
|
snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
|
|
|
|
gene = 'pnca'
|
|
#drug = 'pyrazinamide'
|
|
my_fname1 = '_comp_snps'
|
|
nrows = len(snps_only)
|
|
|
|
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
|
#output_file_path = work_dir + '/Data/'
|
|
#output_file_path = data_dir + '/input/processed/' + drug + '/'
|
|
print(outdir)
|
|
|
|
if not os.path.exists(outdir):
|
|
print( outdir, 'does not exist. Creating')
|
|
os.makedirs(outdir)
|
|
exit()
|
|
|
|
#out_filename = gene + my_fname1 + str(nrows) + '.csv'
|
|
out_filename = '/' + gene + my_fname1 + '.csv'
|
|
outfile = outdir + out_filename
|
|
print(outfile) #<<<- check
|
|
|
|
# write to csv: without column or row names
|
|
snps_only.to_csv(outfile, header = False, index = False)
|
|
|
|
#=#=#=#=#=#=#=#
|
|
# COMMENT: LF1 is the file to extract all unique snps for mcsm
|
|
# but you have that already in file called pnca_snps...
|
|
# LF2: is the file for extracting snps tested for DS and hence OR calcs
|
|
#=#=#=#=#=#=#=#
|
|
|
|
###########
|
|
# Step 13 : Output the whole df i.e
|
|
# file for meta_data which is now formatted with
|
|
# each row as a unique snp rather than the original version where
|
|
# each row is a unique id
|
|
# ***** This is the file you will ADD the AF and OR calculations to *****
|
|
###########
|
|
|
|
# output csv: the entire DF
|
|
# specify variable name for output file
|
|
gene = 'pnca'
|
|
#drug = 'pyrazinamide'
|
|
my_fname4 = '_metadata'
|
|
#nrows = len(meta_pnca_LF1)
|
|
|
|
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
|
#output_file_path = work_dir + '/Data/'
|
|
#output_file_path = data_dir + '/input/processed/' + drug + '/'
|
|
print(outdir)
|
|
|
|
if not os.path.exists(outdir):
|
|
print( outdir, 'does not exist. Creating')
|
|
os.makedirs(outdir)
|
|
exit()
|
|
|
|
#out_filename = gene + my_fname1 + str(nrows) + '.csv'
|
|
out_filename = '/' + gene + my_fname4 + '.csv'
|
|
outfile = outdir + out_filename
|
|
print(outfile) #<<<- check
|
|
|
|
# write out file
|
|
meta_pnca_LF1.to_csv(outfile)
|