adding clean files for rerun of 35k dataset

This commit is contained in:
Tanushree Tunstall 2020-07-07 18:28:55 +01:00
parent 943513a338
commit a7f21cfb14
32 changed files with 157 additions and 44550 deletions

View file

@ -21,6 +21,9 @@ Created on Tue Aug 6 12:56:03 2019
# where each row is a separate mutation
# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique
# NOTE
#drtype is renamed to 'resistance' in the 35k dataset
# output files: all lower case
# 0) <gene>_common_ids.csv
# 1) <gene>_ambiguous_muts.csv
@ -60,6 +63,7 @@ os.getcwd()
# import aa dict
from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
from tidy_split import tidy_split
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
@ -96,8 +100,8 @@ datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
in_filename = 'original_tanushree_data_v2.csv'
#in_filename = 'mtb_gwas_v3.csv'
#in_filename = 'original_tanushree_data_v2.csv' #19k
in_filename = 'mtb_gwas_meta_v3.csv' #33k
infile = datadir + '/' + in_filename
print('Input file: ', infile
, '\n============================================================')
@ -121,17 +125,45 @@ master_data = pd.read_csv(infile, sep = ',')
#list(master_data.columns)
# extract relevant columns from meta data related to the drug
meta_data = master_data[['id'
,'country'
,'lineage'
,'sublineage'
,'drtype'
, drug
, dr_muts_col
, other_muts_col
]]
del(master_data)
#meta_data_ch = master_data[['id'
#, 'country'
#, 'lineage'
#, 'sublineage'
##, 'drtype' #19k only
#, 'resistance'
#, drug
#, dr_muts_col
#, other_muts_col]]
# Metadata columns shared by every subset taken from the master data.
# core_cols is kept (not deleted below) because the later dr_muts_col and
# other_muts_col subsets are built from it as well.
core_cols = ['id'
, 'country'
, 'country2'
, 'geographic_source'
, 'region'
, 'date'
, 'strain'
, 'lineage'
, 'sublineage'
, 'resistance' # this column was called 'drtype' in the 19k dataset
, 'location'
, 'host_body_site'
, 'environment_material'
, 'host_status'
# NOTE(review): both 'hiv_status' and 'HIV_status' are requested; presumably
# the input file has two distinct columns with these names -- confirm, since
# selecting a label that is absent from master_data raises a KeyError.
, 'hiv_status'
, 'HIV_status'
, 'isolation_source']
# Columns whose names depend on the drug being analysed: the drug column
# itself plus the two mutation columns.
variable_based_cols = [drug
, dr_muts_col
, other_muts_col]
cols_to_extract = core_cols + variable_based_cols
# keep only the columns of interest from the full input table
meta_data = master_data[cols_to_extract]
# drop the full table and the temporary column lists; only meta_data and
# core_cols are needed from here on
del(master_data, variable_based_cols, cols_to_extract)
# checks and results
total_samples = meta_data['id'].nunique()
@ -269,14 +301,23 @@ print('gene to extract:', gene_match )
#===============
# FIXME: replace drug with variable containing the drug name
# !!! important !!!
meta_data_dr = meta_data[['id'
,'country'
,'lineage'
,'sublineage'
,'drtype'
, drug
, dr_muts_col
]]
#meta_data_dr = meta_data[['id'
# ,'country'
# ,'lineage'
# ,'sublineage'
# ,'drtype'
# , drug
# , dr_muts_col
# ]]
# Subset for the dr_muts_col extraction: the shared metadata columns plus the
# drug column and dr_muts_col (other_muts_col is left out of this subset).
dr_based_cols = [drug, dr_muts_col]
cols_to_extract = core_cols + dr_based_cols
meta_data_dr = meta_data[cols_to_extract]
# temporary column lists are not needed once the subset has been taken
del(dr_based_cols, cols_to_extract)
print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
print('actual dim:', meta_data_dr.shape
, '\n===============================================================')
@ -306,14 +347,22 @@ dr_id = pd.Series(dr_id)
print('Extracting dr_muts from:', other_muts_col,'with other meta_data')
# FIXME: replace drug with variable containing the drug name
# !!! important !!!
meta_data_other = meta_data[['id'
,'country'
,'lineage'
,'sublineage'
,'drtype'
, drug
, other_muts_col
]]
#meta_data_other = meta_data[['id'
# ,'country'
# ,'lineage'
# ,'sublineage'
## ,'drtype'
# , drug
# , other_muts_col
# ]]
# Subset for the other_muts_col extraction: the shared metadata columns plus
# the drug column and other_muts_col (dr_muts_col is left out of this subset).
# The name dr_based_cols is reused here although it holds other_muts_col.
dr_based_cols = [drug, other_muts_col]
cols_to_extract = core_cols + dr_based_cols
meta_data_other = meta_data[cols_to_extract]
# temporary column lists are not needed once the subset has been taken
del(dr_based_cols, cols_to_extract)
print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
print('actual dim:', meta_data_other.shape
@ -373,7 +422,7 @@ print('Writing file:'
, '\nExpected no. of rows:', len(common_ids)
, '\n=============================================================')
common_ids.to_csv(outfile0)
common_ids.to_csv(outfile0, index = False)
del(out_filename0)
# clear variables
@ -419,44 +468,15 @@ print('This is still dirty data: samples have ', gene_match, 'muts but may have
#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
print('Performing tidy_split(): to separate the mutations into indivdual rows')
# define the split function
def tidy_split(df, column, sep='|', keep=False, dropna=False):
    '''
    Split the values of a column and expand so the new DataFrame has one split
    value per row.

    Values are cast to str before splitting, so by default rows where the
    column is missing are NOT filtered out: they are carried through as the
    string form of the missing value. Pass dropna=True to remove those rows
    before splitting.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as its own row (only applies to
        values that actually split into more than one piece)
    dropna : bool
        whether to drop rows where `column` is missing before splitting;
        the default (False) keeps the historical behaviour

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`. Rows (and their
        index labels) are repeated once per split value; `df` itself is not
        modified.
    '''
    if dropna:
        # rebinding only: the caller's dataframe is left untouched
        df = df.dropna(subset=[column])

    indexes = []      # positional row numbers, one entry per output row
    new_values = []   # value of `column` for each output row
    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            indexes.append(i)
            new_values.append(presplit)
        indexes.extend([i] * len(values))
        new_values.extend(values)

    # positions refer to the (possibly filtered) df, hence iloc rather than loc
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df
#%% end of tidy_split()
#TIDY SPLIT HERE
#=========
# DF1: dr_muts_col
#=========
@ -761,12 +781,11 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m
out_filename1 = gene.lower() + '_ambiguous_muts.csv'
outfile1 = outdir + '/' + out_filename1
print('Writing file: ambiguous muts'
, '\nFilename:', out_filename1
, '\nPath:', outdir)
, '\nFilename:', outfile1)
#common_muts = ['gene_matchVal180Phe','gene_matchGln10Pro'] # test
inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
inspect.to_csv(outfile1)
inspect.to_csv(outfile1, index = False)
print('Finished writing:', out_filename1
, '\nNo. of rows:', len(inspect)
@ -1069,13 +1088,13 @@ else:
print('FAIL: SNP has NA, Possible mapping issues from dict?'
, '\nDebug please!'
, '\n=========================================================')
sys.exit()
out_filename2 = gene.lower() + '_mcsm_snps.csv'
outfile2 = outdir + '/' + out_filename2
print('Writing file: mCSM style muts'
, '\nFilename:', out_filename2
, '\nPath:', outdir
, '\nFilename:', outfile2
, '\nmutation format (SNP): {WT}<POS>{MUT}'
, '\nNo. of distinct muts:', len(snps_only)
, '\nNo. of distinct positions:', len(pos_only)
@ -1083,7 +1102,7 @@ print('Writing file: mCSM style muts'
snps_only.to_csv(outfile2, header = False, index = False)
print('Finished writing:', out_filename2
print('Finished writing:', outfile2
, '\nNo. of rows:', len(snps_only)
, '\nNo. of cols:', len(snps_only.columns)
, '\n=============================================================')
@ -1099,7 +1118,7 @@ print('Writing file: LF formatted data'
, '\n============================================================')
gene_LF1.to_csv(outfile3, header = True, index = False)
print('Finished writing:', out_filename3
print('Finished writing:', outfile3
, '\nNo. of rows:', len(gene_LF1)
, '\nNo. of cols:', len(gene_LF1.columns)
, '\n=============================================================')
@ -1118,11 +1137,11 @@ all_muts_msa.columns.dtype
all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')
# create an extra column with protein name
all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
all_muts_msa_sorted.head()
#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
#all_muts_msa_sorted.head()
# rearrange columns so the fasta name is the first column (required for mutate.script)
all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
all_muts_msa_sorted.head()
print('Checking NA in snps...')# should be 0
@ -1138,15 +1157,14 @@ out_filename4 = gene.lower() +'_all_muts_msa.csv'
outfile4 = outdir + '/' + out_filename4
print('Writing file: mCSM style muts for msa',
'\nFilename:', out_filename4,
'\nPath:', outdir,
'\nFilename:', outfile4,
'\nmutation format (SNP): {WT}<POS>{MUT}',
'\nNo.of lines of msa:', len(all_muts_msa),
)
all_muts_msa_sorted.to_csv(outfile4, header = False, index = False)
print('Finished writing:', out_filename4
print('Finished writing:', outfile4
, '\nNo. of rows:', len(all_muts_msa)
, '\nNo. of cols:', len(all_muts_msa.columns)
, '\n=============================================================')
@ -1177,7 +1195,7 @@ print('Writing file: mutational positions'
pos_only_sorted.to_csv(outfile5, header = True, index = False)
print('Finished writing:', out_filename5
print('Finished writing:', outfile5
, '\nNo. of rows:', len(pos_only_sorted)
, '\nNo. of cols:', len(pos_only_sorted.columns)
, '\n=============================================================')