Merge branch 'embb_dev'

This commit is contained in:
Tanushree Tunstall 2021-11-19 08:05:46 +00:00
commit 4f52627740
10 changed files with 51 additions and 21 deletions

View file

@ -243,7 +243,7 @@ def format_mcsm_output(mcsm_outputcsv):
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation']) mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
print('Dim of data after removing duplicate muts:', mcsm_data.shape print('Dim of data after removing duplicate muts:', mcsm_data.shape
, '\n===========================================================') , '\n===========================================================')
#%%===================================================================== #%%=====================================================================
############# #############
# Create col: duet_outcome # Create col: duet_outcome
############# #############

View file

@ -79,13 +79,13 @@ gene_match = gene + '_p.'
# directories # directories
#============ #============
if not datadir: if not datadir:
datadir = homedir + '/' + 'git/Data' datadir = homedir + '/git/Data/'
if not indir: if not indir:
indir = datadir + '/' + drug + '/input' indir = datadir + drug + 'input/'
if not outdir: if not outdir:
outdir = datadir + '/' + drug + '/output' outdir = datadir + drug + 'output/'
#======= #=======
# input # input
@ -95,7 +95,7 @@ if pdb_filename:
else: else:
in_filename_pdb = gene.lower() + '_complex.pdb' in_filename_pdb = gene.lower() + '_complex.pdb'
infile_pdb = indir + '/' + in_filename_pdb infile_pdb = indir + in_filename_pdb
#in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile_mcsm_snps, from data_extraction.py) #in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile_mcsm_snps, from data_extraction.py)
#infile_snps = outdir + '/' + in_filename_snps #infile_snps = outdir + '/' + in_filename_snps
@ -104,8 +104,8 @@ if mutation_filename:
in_filename_snps = mutation_filename in_filename_snps = mutation_filename
else: else:
in_filename_snps = gene.lower() + '_mcsm_formatted_snps.csv' in_filename_snps = gene.lower() + '_mcsm_formatted_snps.csv'
infile_snps = outdir + '/' + in_filename_snps infile_snps = outdir + in_filename_snps
#======= #=======
# output # output
@ -113,13 +113,13 @@ infile_snps = outdir + '/' + in_filename_snps
# mcsm_results globals # mcsm_results globals
if not result_urls: if not result_urls:
result_urls_filename = gene.lower() + '_result_urls.txt' result_urls_filename = gene.lower() + '_result_urls.txt'
result_urls = outdir + '/' + result_urls_filename result_urls = outdir + result_urls_filename
if DEBUG: if DEBUG:
print('DEBUG: Result URLs:', result_urls) print('DEBUG: Result URLs:', result_urls)
if not mcsm_output: if not mcsm_output:
mcsm_output_filename = gene.lower() + '_mcsm_output.csv' mcsm_output_filename = gene.lower() + '_mcsm_output.csv'
mcsm_output = outdir + '/' + mcsm_output_filename mcsm_output = outdir + mcsm_output_filename
if DEBUG: if DEBUG:
print('DEBUG: mCSM output CSV file:', mcsm_output) print('DEBUG: mCSM output CSV file:', mcsm_output)
@ -127,7 +127,7 @@ if not mcsm_output:
#out_filename_format = gene.lower() + '_mcsm_processed.csv' #out_filename_format = gene.lower() + '_mcsm_processed.csv'
if not outfile_format: if not outfile_format:
out_filename_format = gene.lower() + '_complex_mcsm_norm.csv' out_filename_format = gene.lower() + '_complex_mcsm_norm.csv'
outfile_format = outdir + '/' + out_filename_format outfile_format = outdir + out_filename_format
if DEBUG: if DEBUG:
print('DEBUG: formatted CSV output:', outfile_format) print('DEBUG: formatted CSV output:', outfile_format)
#%%===================================================================== #%%=====================================================================

0
mcsm_na/examples.py Normal file → Executable file
View file

0
mcsm_na/format_results_mcsm_na.py Normal file → Executable file
View file

0
mcsm_na/get_results_mcsm_na.py Normal file → Executable file
View file

9
mcsm_na/run_format_results_mcsm_na.py Normal file → Executable file
View file

@ -52,15 +52,16 @@ if not outdir:
outdir_na = outdir + 'mcsm_na_results/' outdir_na = outdir + 'mcsm_na_results/'
# Input file # Input file
infile_mcsm_na = outdir_na + gene + '_output_combined_clean.tsv' infile_mcsm_na = outdir_na + gene.lower() + '_output_combined_clean.tsv'
# Formatted output file # Formatted output file
outfile_mcsm_na_f = outdir_na + gene + '_complex_mcsm_na_norm.csv' outfile_mcsm_na_f = outdir_na + gene.lower() + '_complex_mcsm_na_norm.csv'
#========================== #===========================================
# CALL: format_results_mcsm_na() # CALL: format_results_mcsm_na()
# Data: gid+streptomycin # Data: gid+streptomycin
#========================== # Data: rpob+rifampicin, date: 18/11/2021
#===========================================
print('Formatting results for:', infile_mcsm_na) print('Formatting results for:', infile_mcsm_na)
mcsm_na_df_f = format_mcsm_na_output(mcsm_na_output_tsv = infile_mcsm_na) mcsm_na_df_f = format_mcsm_na_output(mcsm_na_output_tsv = infile_mcsm_na)

View file

@ -18,14 +18,14 @@ print(my_prediction_url)
# TODO: add cmd line args # TODO: add cmd line args
#gene = 'gid' #gene = 'gid'
drug = 'streptomycin' drug = ''
datadir = homedir + '/git/Data' datadir = homedir + '/git/Data/'
indir = datadir + '/' + drug + '/input' indir = datadir + drug + 'input/'
outdir = datadir + '/' + drug + '/output' outdir = datadir + drug + 'output/'
outdir_mcsm_na = outdir + 'mcsm_na_results' outdir_mcsm_na = outdir + 'mcsm_na_results/'
my_nuc_type = 'RNA' my_nuc_type = 'RNA'
my_pdb_file = indir + '/gid_complex.pdb' my_pdb_file = indir + gene.lower() + '_complex.pdb'
#============================================================================= #=============================================================================
# batch 26: 25.txt # RAN: 16 Feb: # batch 26: 25.txt # RAN: 16 Feb:

27
mcsm_na/split_csv.sh Executable file
View file

@ -0,0 +1,27 @@
#!/bin/bash
# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA
#
# Split a SNP file into numbered batches of <chunk size> lines each.
# Batches are written to <output dir>/<chunk size>/ as snp_batch_00,
# snp_batch_01, ... (numeric suffixes come from `split -d`).
#
# Usage: ~/git/LSHTM_analysis/mcsm_na/split_csv.sh <input file> <output dir> <chunk size in lines>
# copy your snp file to split into the mcsm_na dir

# Stop on the first error (e.g. a failed mkdir/cd) instead of carrying on
# and writing batches into whatever directory we happen to be in.
set -euo pipefail

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <input file> <output dir> <chunk size in lines>" >&2
    exit 1
fi

INFILE=$1
OUTDIR=$2
CHUNK=$3

if [ ! -f "${INFILE}" ]; then
    echo "Error: input file not found: ${INFILE}" >&2
    exit 1
fi

# The chunk size is used both as a directory name and as a line count,
# so it must be a positive integer.
case "${CHUNK}" in
    ''|*[!0-9]*)
        echo "Error: chunk size must be a positive integer: ${CHUNK}" >&2
        exit 1
        ;;
esac
if [ "${CHUNK}" -le 0 ]; then
    echo "Error: chunk size must be a positive integer: ${CHUNK}" >&2
    exit 1
fi

# Resolve the input to an absolute path BEFORE changing directory. The
# previous '../../<input file>' form only worked when the input path was
# relative and <output dir> was a single-level relative directory.
INFILE_ABS="$(cd "$(dirname -- "${INFILE}")" && pwd)/$(basename -- "${INFILE}")"

mkdir -p -- "${OUTDIR}/${CHUNK}"
cd -- "${OUTDIR}/${CHUNK}"
split -l "${CHUNK}" -d -- "${INFILE_ABS}" snp_batch_

# use case
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh rpob_mcsm_formatted_snps_chain.csv snp_batches 20 # date: 17/11/2021
# accidentally replaced the original rpob batches
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh 5uhc_mcsm_formatted_snps_chain.csv snp_batches_5uhc 20 # date: 17/11/2021

0
mcsm_na/submit_mcsm_na.py Normal file → Executable file
View file

View file

@ -122,7 +122,9 @@ if gene.lower() == "gid":
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
if gene.lower() == "embb": if gene.lower() == "embb":
print("\nReading mCSM file for gene:", gene) print("\nReading mCSM file for gene:", gene)
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm2.csv' #844
if gene.lower() in gene_list_normal: if gene.lower() in gene_list_normal:
print("\nReading mCSM file for gene:", gene) print("\nReading mCSM file for gene:", gene)
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv' in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv'