added output file for checking
This commit is contained in:
parent
dcd9a985ec
commit
2d8cb01cb7
2 changed files with 38 additions and 352 deletions
|
@ -52,7 +52,7 @@ Created on Tue Aug 6 12:56:03 2019
|
||||||
import os, sys
|
import os, sys
|
||||||
import re
|
import re
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
#import numpy as np
|
import numpy as np
|
||||||
import argparse
|
import argparse
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% homdir and curr dir and local imports
|
#%% homdir and curr dir and local imports
|
||||||
|
@ -68,18 +68,17 @@ from tidy_split import tidy_split
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% command line args
|
#%% command line args
|
||||||
arg_parser = argparse.ArgumentParser()
|
arg_parser = argparse.ArgumentParser()
|
||||||
arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
|
arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
|
||||||
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
|
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
|
||||||
|
|
||||||
|
|
||||||
args = arg_parser.parse_args()
|
args = arg_parser.parse_args()
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% variable assignment: input and output paths & filenames
|
#%% variable assignment: input and output paths & filenames
|
||||||
#drug = args.drug
|
drug = args.drug
|
||||||
#gene = args.gene
|
gene = args.gene
|
||||||
|
|
||||||
drug = 'pyrazinamide'
|
#drug = 'pyrazinamide'
|
||||||
gene = 'pncA'
|
#gene = 'pncA'
|
||||||
|
|
||||||
gene_match = gene + '_p.'
|
gene_match = gene + '_p.'
|
||||||
print('mut pattern for gene', gene, ':', gene_match)
|
print('mut pattern for gene', gene, ':', gene_match)
|
||||||
|
@ -99,6 +98,7 @@ print('position regex:', pos_regex)
|
||||||
# building cols to extract
|
# building cols to extract
|
||||||
dr_muts_col = 'dr_mutations_' + drug
|
dr_muts_col = 'dr_mutations_' + drug
|
||||||
other_muts_col = 'other_mutations_' + drug
|
other_muts_col = 'other_mutations_' + drug
|
||||||
|
resistance_col = 'drtype'
|
||||||
|
|
||||||
print('Extracting columns based on variables:\n'
|
print('Extracting columns based on variables:\n'
|
||||||
, drug
|
, drug
|
||||||
|
@ -106,6 +106,8 @@ print('Extracting columns based on variables:\n'
|
||||||
, dr_muts_col
|
, dr_muts_col
|
||||||
, '\n'
|
, '\n'
|
||||||
, other_muts_col
|
, other_muts_col
|
||||||
|
, '\n'
|
||||||
|
, resistance_col
|
||||||
, '\n===============================================================')
|
, '\n===============================================================')
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% input and output dirs and files
|
#%% input and output dirs and files
|
||||||
|
@ -120,7 +122,7 @@ outdir = datadir + '/' + drug + '/' + 'output'
|
||||||
# input
|
# input
|
||||||
#=======
|
#=======
|
||||||
#in_filename_master_master = 'original_tanushree_data_v2.csv' #19k
|
#in_filename_master_master = 'original_tanushree_data_v2.csv' #19k
|
||||||
in_filename_master = 'mtb_gwas_meta_v3.csv' #33k
|
in_filename_master = 'mtb_gwas_meta_v5.csv' #34k
|
||||||
infile_master = datadir + '/' + in_filename_master
|
infile_master = datadir + '/' + in_filename_master
|
||||||
print('Input file: ', infile_master
|
print('Input file: ', infile_master
|
||||||
, '\n============================================================')
|
, '\n============================================================')
|
||||||
|
@ -147,33 +149,37 @@ if in_filename_master == 'original_tanushree_data_v2.csv':
|
||||||
, 'country'
|
, 'country'
|
||||||
, 'lineage'
|
, 'lineage'
|
||||||
, 'sublineage'
|
, 'sublineage'
|
||||||
, 'drtype' #19k only
|
, 'drtype'
|
||||||
, drug
|
, drug
|
||||||
, dr_muts_col
|
, dr_muts_col
|
||||||
, other_muts_col]]
|
, other_muts_col]]
|
||||||
|
|
||||||
if in_filename_master == 'mtb_gwas_meta_v3.csv':
|
if in_filename_master == 'mtb_gwas_meta_v5.csv':
|
||||||
core_cols = ['id'
|
core_cols = ['id'
|
||||||
, 'country'
|
, 'sample'
|
||||||
, 'country2'
|
, 'patient_id'
|
||||||
, 'geographic_source'
|
|
||||||
, 'region'
|
|
||||||
, 'date'
|
|
||||||
, 'strain'
|
, 'strain'
|
||||||
, 'lineage'
|
, 'lineage'
|
||||||
, 'sublineage' #drtype renamed to resistance
|
, 'sublineage'
|
||||||
, 'resistance'
|
, 'country'
|
||||||
|
, 'country_code'
|
||||||
|
, 'geographic_source'
|
||||||
|
#, 'region'
|
||||||
, 'location'
|
, 'location'
|
||||||
, 'host_body_site'
|
, 'host_body_site'
|
||||||
, 'environment_material'
|
, 'environment_material'
|
||||||
, 'host_status'
|
, 'host_status'
|
||||||
|
, 'host_sex'
|
||||||
|
, 'submitted_host_sex'
|
||||||
, 'hiv_status'
|
, 'hiv_status'
|
||||||
, 'HIV_status'
|
, 'HIV_status'
|
||||||
|
, 'tissue_type'
|
||||||
, 'isolation_source']
|
, 'isolation_source']
|
||||||
|
|
||||||
variable_based_cols = [drug
|
variable_based_cols = [drug
|
||||||
, dr_muts_col
|
, dr_muts_col
|
||||||
, other_muts_col]
|
, other_muts_col
|
||||||
|
, resistance_col]
|
||||||
|
|
||||||
cols_to_extract = core_cols + variable_based_cols
|
cols_to_extract = core_cols + variable_based_cols
|
||||||
print('Extracting', len(cols_to_extract), 'columns from master data')
|
print('Extracting', len(cols_to_extract), 'columns from master data')
|
||||||
|
@ -193,7 +199,14 @@ print('RESULT: Total samples:', total_samples
|
||||||
meta_data.isna().sum()
|
meta_data.isna().sum()
|
||||||
print('No. of NAs/column:' + '\n', meta_data.isna().sum()
|
print('No. of NAs/column:' + '\n', meta_data.isna().sum()
|
||||||
, '\n===========================================================')
|
, '\n===========================================================')
|
||||||
#
|
|
||||||
|
#%% Write check file
|
||||||
|
check_file = outdir + '/' + gene.lower() + '_gwas.csv'
|
||||||
|
meta_data.to_csv(check_file)
|
||||||
|
print('Writing subsetted gwas data'
|
||||||
|
, '\nFile', check_file
|
||||||
|
, '\nDim:', meta_data.shape)
|
||||||
|
|
||||||
# glance
|
# glance
|
||||||
#meta_data.head()
|
#meta_data.head()
|
||||||
#total_samples - NA pyrazinamide = ?
|
#total_samples - NA pyrazinamide = ?
|
||||||
|
@ -203,7 +216,10 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum()
|
||||||
# equivalent of table in R
|
# equivalent of table in R
|
||||||
# drug counts: complete samples for OR calcs
|
# drug counts: complete samples for OR calcs
|
||||||
meta_data[drug].value_counts()
|
meta_data[drug].value_counts()
|
||||||
print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts()
|
print('===========================================================\n'
|
||||||
|
, 'RESULT: No. of Sus and Res samples:\n', meta_data[drug].value_counts()
|
||||||
|
, '\n===========================================================\n'
|
||||||
|
, 'RESULT: Percentage of Sus and Res samples:\n', meta_data[drug].value_counts(normalize = True)*100
|
||||||
, '\n===========================================================')
|
, '\n===========================================================')
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
|
@ -306,7 +322,8 @@ print('Predicting total no. of rows in the curated df:', dr_gene_count + other_g
|
||||||
, '\n===================================================================')
|
, '\n===================================================================')
|
||||||
expected_rows = dr_gene_count + other_gene_count
|
expected_rows = dr_gene_count + other_gene_count
|
||||||
|
|
||||||
del(i, id, wt_other, clean_df, na_count, id2_other, count_gene_other, count_wt)
|
#del( wt_other, clean_df, i, id, na_count, id2_other, count_gene_other, count_wt)
|
||||||
|
del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other,count_wt )
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
############
|
############
|
||||||
|
|
|
@ -1,331 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
'''
|
|
||||||
Created on Tue Aug 6 12:56:03 2019
|
|
||||||
|
|
||||||
@author: tanu
|
|
||||||
'''
|
|
||||||
|
|
||||||
# FIXME: include error checking to ensure you only
|
|
||||||
# concentrate on positions that have structural info?
|
|
||||||
|
|
||||||
# FIXME: import dirs.py to get the basic dir paths available
|
|
||||||
#=======================================================================
|
|
||||||
# TASK:
|
|
||||||
|
|
||||||
|
|
||||||
#=======================================================================
|
|
||||||
#%% load libraries
|
|
||||||
import os, sys
|
|
||||||
import re
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
import argparse
|
|
||||||
#=======================================================================
|
|
||||||
#%% homdir and curr dir and local imports
|
|
||||||
homedir = os.path.expanduser('~')
|
|
||||||
# set working dir
|
|
||||||
os.getcwd()
|
|
||||||
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
|
||||||
os.getcwd()
|
|
||||||
|
|
||||||
# import aa dict
|
|
||||||
#from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
|
|
||||||
#from tidy_split import tidy_split
|
|
||||||
#=======================================================================
|
|
||||||
#%% command line args
|
|
||||||
arg_parser = argparse.ArgumentParser()
|
|
||||||
arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
|
|
||||||
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
|
|
||||||
|
|
||||||
|
|
||||||
args = arg_parser.parse_args()
|
|
||||||
#=======================================================================
|
|
||||||
#%% variable assignment: input and output paths & filenames
|
|
||||||
#drug = args.drug
|
|
||||||
#gene = args.gene
|
|
||||||
|
|
||||||
drug = 'pyrazinamide'
|
|
||||||
gene = 'pncA'
|
|
||||||
|
|
||||||
gene_match = gene + '_p.'
|
|
||||||
print('mut pattern for gene', gene, ':', gene_match)
|
|
||||||
|
|
||||||
nssnp_match = gene_match +'[A-Z]{3}[0-9]+[A-Z]{3}'
|
|
||||||
print('nsSNP for gene', gene, ':', nssnp_match)
|
|
||||||
|
|
||||||
# Pattern: lowercased gene_match prefix followed by a capture group of three
# word characters (presumably the wild-type residue code -- confirm with usage).
# Raw string so '\w' is passed to the regex engine verbatim instead of relying
# on an invalid string escape, which newer Python versions warn about.
# NOTE(review): the '.' in gene_match is not escaped, so it matches any character.
wt_regex = gene_match.lower() + r'(\w{3})'
|
|
||||||
print('wt regex:', wt_regex)
|
|
||||||
|
|
||||||
mut_regex = r'\d+(\w{3})$'
|
|
||||||
print('mt regex:', mut_regex)
|
|
||||||
|
|
||||||
pos_regex = r'(\d+)'
|
|
||||||
print('position regex:', pos_regex)
|
|
||||||
|
|
||||||
# building cols to extract
|
|
||||||
dr_muts_col = 'dr_mutations_' + drug
|
|
||||||
other_muts_col = 'other_mutations_' + drug
|
|
||||||
dr_type = "resistance"
|
|
||||||
|
|
||||||
|
|
||||||
print('Extracting columns based on variables:\n'
|
|
||||||
, drug
|
|
||||||
, '\n'
|
|
||||||
, dr_type
|
|
||||||
, '\n'
|
|
||||||
, dr_muts_col
|
|
||||||
, '\n'
|
|
||||||
, other_muts_col
|
|
||||||
, '\n===============================================================')
|
|
||||||
#=======================================================================
|
|
||||||
#%% input and output dirs and files
|
|
||||||
#=======
|
|
||||||
# dirs
|
|
||||||
#=======
|
|
||||||
datadir = homedir + '/' + 'git/Data'
|
|
||||||
indir = datadir + '/' + drug + '/' + 'input'
|
|
||||||
outdir = datadir + '/' + drug + '/' + 'output'
|
|
||||||
|
|
||||||
#=======
|
|
||||||
# input
|
|
||||||
#=======
|
|
||||||
#in_filename_master_master = 'original_tanushree_data_v2.csv' #19k
|
|
||||||
in_filename_v2 = 'original_tanushree_data_v2.csv' #19k
|
|
||||||
infile_master_v2 = datadir + '/' + in_filename_v2
|
|
||||||
print('Input file v2: ', infile_master_v2
|
|
||||||
, '\n============================================================')
|
|
||||||
|
|
||||||
in_filename_v3 = 'mtb_gwas_meta_v3.csv' #33k
|
|
||||||
infile_master_v3 = datadir + '/' + in_filename_v3
|
|
||||||
print('Input file v3: ', infile_master_v3
|
|
||||||
, '\n============================================================')
|
|
||||||
|
|
||||||
|
|
||||||
in_filename_v4 = 'mtb_gwas_meta_v4.csv' #34k
|
|
||||||
infile_master_v4 = datadir + '/' + in_filename_v4
|
|
||||||
print('Input file v4: ', infile_master_v4
|
|
||||||
, '\n============================================================')
|
|
||||||
|
|
||||||
in_filename_v5 = 'mtb_gwas_meta_v5.csv' #34k
|
|
||||||
infile_master_v5 = datadir + '/' + in_filename_v5
|
|
||||||
# report the path of the v5 master file; the label names v5 to match the value printed
print('Input file v5: ', infile_master_v5
      , '\n============================================================')
|
|
||||||
|
|
||||||
#=======
|
|
||||||
# output
|
|
||||||
#=======
|
|
||||||
# several output files: in respective sections at the time of outputting files
|
|
||||||
print('Output filename: in the respective sections'
|
|
||||||
, '\nOutput path: ', outdir
|
|
||||||
, '\n=============================================================')
|
|
||||||
|
|
||||||
#%%end of variable assignment for input and output files
|
|
||||||
#=======================================================================
|
|
||||||
#%% Read input file
|
|
||||||
master_data_v2 = pd.read_csv(infile_master_v2, sep = ',', dtype = 'unicode') # ascii
|
|
||||||
master_data_v3 = pd.read_csv(infile_master_v3, sep = ',', dtype = 'unicode')
|
|
||||||
master_data_v4 = pd.read_csv(infile_master_v4, sep = ',', dtype = 'unicode')
|
|
||||||
master_data_v5 = pd.read_csv(infile_master_v5, sep = ',', dtype = 'unicode')
|
|
||||||
#DtypeWarning: Columns (48) have mixed types.Specify dtype option on import or set low_memory=False.
|
|
||||||
# interactivity=interactivity, compiler=compiler, result=result)
|
|
||||||
|
|
||||||
#==========
|
|
||||||
# na_check
|
|
||||||
#==========
|
|
||||||
#==================================================================
|
|
||||||
v2_na = master_data_v2.isna().sum()
|
|
||||||
v2_na.name = "v2_na_count"
|
|
||||||
v2_na = v2_na.to_frame()
|
|
||||||
v2_na['v2_na_percent'] = master_data_v2.isna().mean().round(4)*100
|
|
||||||
|
|
||||||
master_data_v2['drtype'].value_counts()
|
|
||||||
master_data_v2['drtype'].value_counts().sum() == len(master_data_v2)
|
|
||||||
|
|
||||||
v2 = master_data_v2[['id'
|
|
||||||
, 'country'
|
|
||||||
, 'lineage'
|
|
||||||
, 'sublineage'
|
|
||||||
, 'drtype'
|
|
||||||
, drug
|
|
||||||
, dr_muts_col
|
|
||||||
, other_muts_col]]
|
|
||||||
|
|
||||||
v2.isna().sum()
|
|
||||||
|
|
||||||
print('complete samples v2:', v2['id'].nunique() - v2[drug].isna().sum())
|
|
||||||
|
|
||||||
#==================================================================
|
|
||||||
v3_na = master_data_v3.isna().sum()
|
|
||||||
v3_na.name = "v3_na_count"
|
|
||||||
v3_na = v3_na.to_frame()
|
|
||||||
v3_na['v3_na_percent'] = master_data_v3.isna().mean().round(4)*100
|
|
||||||
|
|
||||||
master_data_v3['resistance'].value_counts()
|
|
||||||
master_data_v3['resistance'].value_counts().sum() == len(master_data_v3)
|
|
||||||
|
|
||||||
v3 = master_data_v3[['id'
|
|
||||||
, 'country'
|
|
||||||
, 'lineage'
|
|
||||||
, 'sublineage'
|
|
||||||
, 'resistance'
|
|
||||||
, drug
|
|
||||||
, dr_muts_col
|
|
||||||
, other_muts_col]]
|
|
||||||
|
|
||||||
v3.isna().sum()
|
|
||||||
|
|
||||||
print('complete samples v3:', v3['id'].nunique() - v3[drug].isna().sum())
|
|
||||||
|
|
||||||
#==================================================================
|
|
||||||
v4_na = master_data_v4.isna().sum()
|
|
||||||
v4_na.name = "v4_na_count"
|
|
||||||
v4_na = v4_na.to_frame()
|
|
||||||
v4_na['v4_na_percent'] = master_data_v4.isna().mean().round(4)*100
|
|
||||||
|
|
||||||
v4 = master_data_v4[['id'
|
|
||||||
, 'country'
|
|
||||||
, 'lineage'
|
|
||||||
, 'sublineage'
|
|
||||||
, drug
|
|
||||||
, dr_muts_col
|
|
||||||
, other_muts_col]]
|
|
||||||
|
|
||||||
v4.isna().sum()
|
|
||||||
|
|
||||||
print('complete samples v4:', v4['id'].nunique() - v4[drug].isna().sum())
|
|
||||||
#==================================================================
|
|
||||||
v5_na = master_data_v5.isna().sum()
|
|
||||||
v5_na.name = "v5_na_count"
|
|
||||||
v5_na = v5_na.to_frame()
|
|
||||||
# percentage of missing values per column of the v5 data (fraction rounded to
# 4 decimals, then scaled to percent); column label carries the v5 version tag
v5_na['v5_na_percent'] = master_data_v5.isna().mean().round(4)*100
|
|
||||||
|
|
||||||
v5 = master_data_v5[['id'
|
|
||||||
, 'country'
|
|
||||||
, 'lineage'
|
|
||||||
, 'sublineage'
|
|
||||||
, drug
|
|
||||||
, dr_muts_col
|
|
||||||
, other_muts_col]]
|
|
||||||
|
|
||||||
v5.isna().sum()
|
|
||||||
|
|
||||||
print('complete samples v5:', v5['id'].nunique() - v5[drug].isna().sum())
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#====================================================================
|
|
||||||
# checking ids
|
|
||||||
id_check1 = master_data_v2['id'].isin(master_data_v3['id']).sum()
|
|
||||||
# id_check1 counts ids of the 19k dataset (v2) that also occur in the 33k dataset (v3)
print('No. of 19k dataset (v2) ids in 33k dataset (v3):', id_check1)
|
|
||||||
|
|
||||||
id_check2 = master_data_v2['id'].isin(master_data_v4['id']).sum()
|
|
||||||
# id_check2 counts ids of the 19k dataset (v2) that also occur in the 34k dataset (v4)
print('No. of 19k dataset (v2) ids in 34k dataset (v4):', id_check2)
|
|
||||||
id_check3 = master_data_v4['id'].isin(master_data_v2['id']).sum()
|
|
||||||
# id_check3 is the reverse direction: ids of the 34k dataset (v4) that also occur in the 19k dataset (v2)
print('No. of 34k dataset (v4) ids in 19k dataset (v2):', id_check3)
|
|
||||||
|
|
||||||
id_check4 = master_data_v3['sample_accession'].isin(master_data_v4['sample_accession']).sum()
|
|
||||||
# id_check4 counts sample_accession values of the 33k dataset (v3) that also occur in the 34k dataset (v4)
print('No. of 33k dataset (v3) ids in 34k dataset (v4):', id_check4)
|
|
||||||
id_check5 = master_data_v4['sample_accession'].isin(master_data_v3['sample_accession']).sum()
|
|
||||||
print('No. of 34k dataset (v4) ids in 33k dataset (v3):', id_check5)
|
|
||||||
|
|
||||||
master_data_v3['sample_accession'].equals(master_data_v3['accession'])
|
|
||||||
master_data_v3['sample_accession'].isin(master_data_v3['accession']).sum()
|
|
||||||
master_data_v3['accession'].isin(master_data_v3['sample_accession']).sum()
|
|
||||||
|
|
||||||
|
|
||||||
master_data_v4['sample_accession'].equals(master_data_v4['accession'])
|
|
||||||
master_data_v4['sample_accession'].isin(master_data_v4['accession']).sum()
|
|
||||||
master_data_v4['accession'].isin(master_data_v4['sample_accession']).sum()
|
|
||||||
|
|
||||||
#===================================================================
|
|
||||||
|
|
||||||
|
|
||||||
#====================================================================
|
|
||||||
#which v3 cols are NOT IN V4
|
|
||||||
master_data_v3.columns[~master_data_v3.columns.isin(master_data_v4.columns)]
|
|
||||||
|
|
||||||
# which v4 cols ARE NOT in v3
|
|
||||||
master_data_v4.columns[~master_data_v4.columns.isin(master_data_v3.columns)]
|
|
||||||
|
|
||||||
|
|
||||||
# job: I need resistance and region in v4 data from v3
|
|
||||||
# find merging cols
|
|
||||||
np.intersect1d(master_data_v3.columns, master_data_v4.columns)
|
|
||||||
|
|
||||||
|
|
||||||
master_data_v3['id'].nunique() == len(master_data_v3)
|
|
||||||
master_data_v3['sample_accession'].nunique() == len(master_data_v3)
|
|
||||||
master_data_v3['accession'].nunique() == len(master_data_v3)
|
|
||||||
master_data_v3['run_accession'].nunique() == len(master_data_v3)
|
|
||||||
|
|
||||||
|
|
||||||
master_data_v4['id'].nunique() == len(master_data_v4)
|
|
||||||
master_data_v4['sample_accession'].nunique() == len(master_data_v4)
|
|
||||||
master_data_v4['accession'].nunique() == len(master_data_v4)
|
|
||||||
master_data_v4['run_accession'].nunique() == len(master_data_v4)
|
|
||||||
|
|
||||||
c_v4 = master_data_v4[['id', 'sample', 'sample_accession', 'run_accession', 'accession'
|
|
||||||
, 'location', 'country', 'geographic_source', 'country_code']]
|
|
||||||
c_v4.isna().sum()
|
|
||||||
|
|
||||||
c_v4_ids = master_data_v4[['id', 'sample', 'sample_accession', 'run_accession', 'accession']]
|
|
||||||
c_v4_ids.isna().sum()
|
|
||||||
c_v4_ids.eq(c_v4_ids.iloc[:, 0], axis=0)
|
|
||||||
|
|
||||||
c_v3 = master_data_v3[['id', 'sample_accession', 'run_accession','accession'
|
|
||||||
, 'location', 'country', 'geographic_source', 'region']]
|
|
||||||
c_v3.isna().sum()
|
|
||||||
c_v3_ids = master_data_v3[['id', 'sample_accession', 'run_accession', 'accession']]
|
|
||||||
c_v3_ids.isna().sum()
|
|
||||||
c_v3_ids.eq(c_v3_ids.iloc[:, 0], axis=0)
|
|
||||||
# comment: id, sample, sample_accession and run_accession seem to have no na
|
|
||||||
|
|
||||||
master_data_v4[drug].isna().sum()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#%% Write file: mCSM muts
|
|
||||||
|
|
||||||
|
|
||||||
#%% Write file: gene_metadata (i.e gene_LF1)
|
|
||||||
# where each row has UNIQUE mutations NOT unique sample ids
|
|
||||||
out_filename_metadata = gene.lower() + '_metadata.csv'
|
|
||||||
outfile_metadata = outdir + '/' + out_filename_metadata
|
|
||||||
print('Writing file: LF formatted data'
|
|
||||||
, '\nFile:', outfile_metadata
|
|
||||||
, '\n============================================================')
|
|
||||||
|
|
||||||
gene_LF1.to_csv(outfile_metadata, header = True, index = False)
|
|
||||||
print('Finished writing:', outfile_metadata
|
|
||||||
, '\nNo. of rows:', len(gene_LF1)
|
|
||||||
, '\nNo. of cols:', len(gene_LF1.columns)
|
|
||||||
, '\n=============================================================')
|
|
||||||
del(out_filename_metadata)
|
|
||||||
|
|
||||||
#%% write file: mCSM style but with repetitions for MSA and logo plots
|
|
||||||
|
|
||||||
print('Writing file: mCSM style muts for msa',
|
|
||||||
'\nFile:', outfile_msa,
|
|
||||||
'\nmutation format (SNP): {WT}<POS>{MUT}',
|
|
||||||
'\nNo.of lines of msa:', len(all_muts_msa))
|
|
||||||
|
|
||||||
all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)
|
|
||||||
|
|
||||||
print('Finished writing:', outfile_msa
|
|
||||||
, '\nNo. of rows:', len(all_muts_msa)
|
|
||||||
, '\nNo. of cols:', len(all_muts_msa.columns)
|
|
||||||
, '\n=============================================================')
|
|
||||||
|
|
||||||
del(out_filename_msa)
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue