work from thinkpad
This commit is contained in:
parent
4d03a43c4a
commit
660ab31ce8
2 changed files with 86 additions and 65 deletions
|
@ -81,8 +81,8 @@ indir = args.input_dir
|
||||||
outdir = args.output_dir
|
outdir = args.output_dir
|
||||||
make_dirs = args.make_dirs
|
make_dirs = args.make_dirs
|
||||||
|
|
||||||
#drug = 'ethambutol'
|
#drug = 'streptomycin'
|
||||||
#gene = 'embB'
|
#gene = 'gid'
|
||||||
|
|
||||||
#%% input and output dirs and files
|
#%% input and output dirs and files
|
||||||
#=======
|
#=======
|
||||||
|
@ -122,15 +122,15 @@ if make_dirs:
|
||||||
# handle missing dirs here
|
# handle missing dirs here
|
||||||
if not os.path.isdir(datadir):
|
if not os.path.isdir(datadir):
|
||||||
print('ERROR: Data directory does not exist:', datadir
|
print('ERROR: Data directory does not exist:', datadir
|
||||||
, '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option ---make_dirs')
|
, '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option --make_dirs')
|
||||||
sys.exit()
|
sys.exit()
|
||||||
if not os.path.isdir(indir):
|
if not os.path.isdir(indir):
|
||||||
print('ERROR: Input directory does not exist:', indir
|
print('ERROR: Input directory does not exist:', indir
|
||||||
, '\nPlease either create or specify indir and rerun\nelse specify cmd option ---make_dirs')
|
, '\nPlease either create or specify indir and rerun\nelse specify cmd option --make_dirs')
|
||||||
sys.exit()
|
sys.exit()
|
||||||
if not os.path.isdir(outdir):
|
if not os.path.isdir(outdir):
|
||||||
print('ERROR: Output directory does not exist:', outdir
|
print('ERROR: Output directory does not exist:', outdir
|
||||||
, '\nPlease create or specify outdir and rerun\nelse specify cmd option ---make_dirs')
|
, '\nPlease create or specify outdir and rerun\nelse specify cmd option --make_dirs')
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
# Requires
|
# Requires
|
||||||
|
@ -317,7 +317,7 @@ for i, id in enumerate(clean_df.id):
|
||||||
print('RESULTS:')
|
print('RESULTS:')
|
||||||
print('Total WT in dr_muts_col:', wt)
|
print('Total WT in dr_muts_col:', wt)
|
||||||
print('Total matches of', gene, 'SNP matches in', dr_muts_col, ':', dr_gene_count)
|
print('Total matches of', gene, 'SNP matches in', dr_muts_col, ':', dr_gene_count)
|
||||||
print('Total samples with > 1', gene, 'muts in dr_muts_col:', len(id2_dr) )
|
print('Total samples with > 1', gene, 'nsSNPs in dr_muts_col:', len(id2_dr) )
|
||||||
print('=================================================================')
|
print('=================================================================')
|
||||||
|
|
||||||
del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
|
del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
|
||||||
|
@ -361,7 +361,7 @@ for i, id in enumerate(clean_df.id):
|
||||||
print('RESULTS:')
|
print('RESULTS:')
|
||||||
print('Total WT in other_muts_col:', wt_other)
|
print('Total WT in other_muts_col:', wt_other)
|
||||||
print('Total matches of', gene, 'SNP matches in', other_muts_col, ':', other_gene_count)
|
print('Total matches of', gene, 'SNP matches in', other_muts_col, ':', other_gene_count)
|
||||||
print('Total samples with > 1', gene, 'muts in other_muts_col:', len(id2_other) )
|
print('Total samples with > 1', gene, 'nsSNPs in other_muts_col:', len(id2_other) )
|
||||||
print('=================================================================')
|
print('=================================================================')
|
||||||
|
|
||||||
print('Predicting total no. of rows in the curated df:', dr_gene_count + other_gene_count
|
print('Predicting total no. of rows in the curated df:', dr_gene_count + other_gene_count
|
||||||
|
@ -851,7 +851,7 @@ else:
|
||||||
, '\nMuts are unique to dr_ and other_ mutation class'
|
, '\nMuts are unique to dr_ and other_ mutation class'
|
||||||
, '\n=========================================================')
|
, '\n=========================================================')
|
||||||
|
|
||||||
# inspect dr_muts and other muts
|
# inspect dr_muts and other muts: Fixed in case no ambiguous muts detected!
|
||||||
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
||||||
print('Finding ambiguous muts...'
|
print('Finding ambiguous muts...'
|
||||||
, '\n========================================================='
|
, '\n========================================================='
|
||||||
|
@ -861,24 +861,37 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
||||||
, '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum()
|
, '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum()
|
||||||
, '\nThese are:\n', other_muts[other_muts.isin(dr_muts)]
|
, '\nThese are:\n', other_muts[other_muts.isin(dr_muts)]
|
||||||
, '\n=========================================================')
|
, '\n=========================================================')
|
||||||
else:
|
|
||||||
sys.exit('Error: ambiguous muts present, but extraction failed. Debug!')
|
|
||||||
|
|
||||||
print('Counting no. of ambiguous muts...')
|
print('Counting no. of ambiguous muts...'
|
||||||
|
|
||||||
if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique():
|
|
||||||
common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
|
|
||||||
print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts))
|
|
||||||
, '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n')
|
|
||||||
print('\n===========================================================')
|
|
||||||
else:
|
|
||||||
print('Error: ambiguous muts detected, but extraction failed. Debug!'
|
|
||||||
, '\nNo. of ambiguous muts in dr:'
|
, '\nNo. of ambiguous muts in dr:'
|
||||||
, len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
|
, len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
|
||||||
, '\nNo. of ambiguous muts in other:'
|
, '\nNo. of ambiguous muts in other:'
|
||||||
, len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
|
, len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
|
||||||
, '\n=========================================================')
|
, '\n=========================================================')
|
||||||
|
|
||||||
|
if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique():
|
||||||
|
common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
|
||||||
|
print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts))
|
||||||
|
, '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n')
|
||||||
|
print('\n===========================================================')
|
||||||
|
else:
|
||||||
|
#sys.exit('Error: ambiguous muts present, but extraction failed. Debug!')
|
||||||
|
print('No: ambiguous muts present')
|
||||||
|
|
||||||
|
#print('Counting no. of ambiguous muts...')
|
||||||
|
#if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique():
|
||||||
|
# common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
|
||||||
|
# print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts))
|
||||||
|
# , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n')
|
||||||
|
# print('\n===========================================================')
|
||||||
|
#else:
|
||||||
|
# print('Error: ambiguous muts detected, but extraction failed. Debug!'
|
||||||
|
# , '\nNo. of ambiguous muts in dr:'
|
||||||
|
# , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
|
||||||
|
# , '\nNo. of ambiguous muts in other:'
|
||||||
|
# , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
|
||||||
|
# , '\n=========================================================')
|
||||||
|
|
||||||
#%% clear variables
|
#%% clear variables
|
||||||
del(id_dr, id_other
|
del(id_dr, id_other
|
||||||
#, meta_data
|
#, meta_data
|
||||||
|
@ -893,25 +906,24 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m
|
||||||
#print(outdir)
|
#print(outdir)
|
||||||
#dr_muts.to_csv('dr_muts.csv', header = True)
|
#dr_muts.to_csv('dr_muts.csv', header = True)
|
||||||
#other_muts.to_csv('other_muts.csv', header = True)
|
#other_muts.to_csv('other_muts.csv', header = True)
|
||||||
|
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
||||||
out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
|
out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
|
||||||
outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
|
outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
|
||||||
print('\n----------------------------------'
|
print('\n----------------------------------'
|
||||||
, '\nWriting file: ambiguous muts'
|
, '\nWriting file: ambiguous muts'
|
||||||
, '\n----------------------------------'
|
, '\n----------------------------------'
|
||||||
, '\nFilename:', outfile_ambig_muts)
|
, '\nFilename:', outfile_ambig_muts)
|
||||||
|
inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
|
||||||
|
inspect.to_csv(outfile_ambig_muts, index = False)
|
||||||
|
|
||||||
inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
|
print('Finished writing:', out_filename_ambig_muts
|
||||||
inspect.to_csv(outfile_ambig_muts, index = False)
|
|
||||||
|
|
||||||
print('Finished writing:', out_filename_ambig_muts
|
|
||||||
, '\nNo. of rows:', len(inspect)
|
, '\nNo. of rows:', len(inspect)
|
||||||
, '\nNo. of cols:', len(inspect.columns)
|
, '\nNo. of cols:', len(inspect.columns)
|
||||||
, '\nNo. of rows = no. of samples with the ambiguous muts present:'
|
, '\nNo. of rows = no. of samples with the ambiguous muts present:'
|
||||||
, dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum()
|
, dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum()
|
||||||
, '\n=============================================================')
|
, '\n=============================================================')
|
||||||
|
del(out_filename_ambig_muts)
|
||||||
|
|
||||||
del(out_filename_ambig_muts)
|
|
||||||
#%% end of data extraction and some files writing. Below are some more files writing.
|
#%% end of data extraction and some files writing. Below are some more files writing.
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
#%% Formatting df: read aa dict and pull relevant info
|
#%% Formatting df: read aa dict and pull relevant info
|
||||||
|
@ -1181,7 +1193,7 @@ if snps_only.mutationinformation.isna().sum() == 0:
|
||||||
else:
|
else:
|
||||||
sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
|
sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
|
||||||
|
|
||||||
out_filename_mcsmsnps = gene.lower() + '_mcsm_snps.csv'
|
out_filename_mcsmsnps = gene.lower() + '_mcsm_style_snps.csv'
|
||||||
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
|
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
|
||||||
|
|
||||||
print('\n----------------------------------'
|
print('\n----------------------------------'
|
||||||
|
@ -1215,7 +1227,7 @@ metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = T
|
||||||
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
|
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
|
||||||
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
|
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
|
||||||
print('\n----------------------------------'
|
print('\n----------------------------------'
|
||||||
, 'Writing file: Metadata poscounts'
|
, '\nWriting file: Metadata poscounts'
|
||||||
, '\n----------------------------------'
|
, '\n----------------------------------'
|
||||||
, '\nFile:', outfile_metadata_poscounts
|
, '\nFile:', outfile_metadata_poscounts
|
||||||
, '\n============================================================')
|
, '\n============================================================')
|
||||||
|
@ -1309,7 +1321,7 @@ out_filename_pos = gene.lower() + '_mutational_positons.csv'
|
||||||
outfile_pos = outdir + '/' + out_filename_pos
|
outfile_pos = outdir + '/' + out_filename_pos
|
||||||
|
|
||||||
print('\n----------------------------------'
|
print('\n----------------------------------'
|
||||||
, 'Writing file: mutational positions'
|
, '\nWriting file: mutational positions'
|
||||||
, '\n----------------------------------'
|
, '\n----------------------------------'
|
||||||
, '\nFile:', outfile_pos
|
, '\nFile:', outfile_pos
|
||||||
, '\nNo. of distinct positions:', len(pos_only_sorted)
|
, '\nNo. of distinct positions:', len(pos_only_sorted)
|
||||||
|
@ -1349,15 +1361,14 @@ print('============================================'
|
||||||
, '\nTotal no. of samples with missense muts:', len(gene_LF1)
|
, '\nTotal no. of samples with missense muts:', len(gene_LF1)
|
||||||
, '\nTotal no. of unique samples with missense muts:', gene_LF1['id'].nunique()
|
, '\nTotal no. of unique samples with missense muts:', gene_LF1['id'].nunique()
|
||||||
, '\n'
|
, '\n'
|
||||||
, '\nTotal no.of samples with common_ids:', nu_common_ids['id']
|
, '\nTotal no.of samples with common_ids:', nu_common_ids['id'])
|
||||||
, '\nTotal no.of samples with ambiguous muts:', len(inspect)
|
|
||||||
|
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
||||||
|
print('\nTotal no.of samples with ambiguous muts:', len(inspect)
|
||||||
#, '\nTotal no.of unique ambiguous muts:', len(common_muts)
|
#, '\nTotal no.of unique ambiguous muts:', len(common_muts)
|
||||||
, '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
|
, '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
|
||||||
, '\n============================================================='
|
, '\n============================================================='
|
||||||
, '\n\n\n')
|
, '\n\n\n')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
print(u'\u2698' * 50,
|
print(u'\u2698' * 50,
|
||||||
'\nEnd of script: Data extraction and writing files'
|
'\nEnd of script: Data extraction and writing files'
|
||||||
|
|
|
@ -1,32 +1,42 @@
|
||||||
#========
|
#========
|
||||||
# data extraction: Must be run first to extract mutations for each drug-gene combination
|
# data extraction: Must be run first to extract mutations for each drug-gene combination
|
||||||
#========
|
#========
|
||||||
./data_extraction.py -d pyrazinamide -g pncA
|
./data_extraction.py -d <drug> -g <gene> --make_dirs
|
||||||
|
|
||||||
|
#========
|
||||||
|
# add chains to a PDB file: for modeller models lack chain ID, this script is used
|
||||||
|
#========
|
||||||
|
add_chains_pdb.py <N> MY_PDB.pdb
|
||||||
|
|
||||||
|
#========
|
||||||
|
# pdb data extraction: To find out discontinuity of chain and removing invalid muts to allow foldx and mcsm to run properly!
|
||||||
|
#========
|
||||||
|
In progress...
|
||||||
|
|
||||||
#========
|
#========
|
||||||
# foldx: specify chain if default is NOT 'A'
|
# foldx: specify chain if default is NOT 'A'
|
||||||
#========
|
#========
|
||||||
./runFoldx.py -d pyrazinamide -g pncA
|
./runFoldx.py -d <drug> -g <gene> -c1 A -p /media/tanu/eb1d072a-3f73-427f-aeb8-f6852b5c5216/Data/processing
|
||||||
|
|
||||||
#========
|
#========
|
||||||
# mcsm: specify chain if default is NOT 'A'
|
# mcsm: specify chain if default is NOT 'A'
|
||||||
#========
|
#========
|
||||||
./run_mcsm.py -d pyrazinamide -g pncA -s submit -l PZA --debug
|
./run_mcsm.py -d <drug> -g <gene> -s submit -l PZA --debug
|
||||||
./run_mcsm.py -d pyrazinamide -g pncA -s get
|
./run_mcsm.py -d <drug> -g <gene> pncA -s get
|
||||||
./run_mcsm.py -d pyrazinamide -g pncA -s format
|
./run_mcsm.py -d <drug> -g <gene> pncA -s format
|
||||||
|
|
||||||
#====================
|
#====================
|
||||||
# other struct params
|
# other struct params
|
||||||
#====================
|
#====================
|
||||||
./dssp_df.py -d pyrazinamide -g pncA
|
./dssp_df.py -d <drug> -g <gene>
|
||||||
# Errors on matplot.lib warn=, so just comment it out for the timebeing!: MONKEY PATCH
|
# Errors on matplot.lib warn=, so just comment it out for the timebeing!: MONKEY PATCH
|
||||||
./kd_df.py -d pyrazinamide -g pncA -fasta # fixme: NO of cols says 2, but is actually 3
|
./kd_df.py -d <drug> -g <gene> -fasta # fixme: NO of cols says 2, but is actually 3
|
||||||
./rd_df.py -d pyrazinamide -g pncA # fixme: input tsv file is sourced manually from website!
|
./rd_df.py -d <drug> -g <gene> # fixme: input tsv file is sourced manually from website!
|
||||||
|
|
||||||
#==============================
|
#==============================
|
||||||
# af_or calcs: different types
|
# af_or calcs: different types
|
||||||
#==============================
|
#==============================
|
||||||
./af_or_calcs.R --d pyrazinamide --gene pncA # fixme: No conditional dir structure
|
./af_or_calcs.R -d <drug> -g <gene># fixme: No conditional dir structure
|
||||||
|
|
||||||
#==============================
|
#==============================
|
||||||
# af_or calcs: kinship
|
# af_or calcs: kinship
|
||||||
|
@ -40,18 +50,18 @@ USE THE BELOW from within the or_kinship_link.py script or something?! as part o
|
||||||
# for now use the file already created using some manual wrestling to link
|
# for now use the file already created using some manual wrestling to link
|
||||||
# the OR for kinship with mutations
|
# the OR for kinship with mutations
|
||||||
|
|
||||||
./or_kinship_link.py -d pyrazinamide -g pncA -sc 2288681 -ec 2289241
|
./or_kinship_link.py -d <drug> -g <gene> -sc <start_coord> -ec <end_coord>
|
||||||
|
|
||||||
#==============================
|
#==============================
|
||||||
# formatting: ns<gene>_snp_info.txt
|
# formatting: ns<gene>_snp_info.txt
|
||||||
#==============================
|
#==============================
|
||||||
# This adds mcsm style muts
|
# This adds mcsm style muts
|
||||||
./snpinfo_format.py -d pyrazinamide -g pncA
|
./snpinfo_format.py -d <drug> -g <gene>
|
||||||
|
|
||||||
#==============================
|
#==============================
|
||||||
# combining dfs: combining_dfs.py
|
# combining dfs: combining_dfs.py
|
||||||
#==============================
|
#==============================
|
||||||
# FIXME: combining_FIXME.py
|
# FIXME: combining_FIXME.py
|
||||||
./combining_dfs.py -d pyrazinamide -g pncA
|
./combining_dfs.py --d <drug> -g <gene>
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue