adding clean files for rerun of the 35k dataset
This commit is contained in:
parent
0973717287
commit
8f460347b4
32 changed files with 157 additions and 44550 deletions
|
@ -21,6 +21,9 @@ Created on Tue Aug 6 12:56:03 2019
|
|||
# where each row is a separate mutation
|
||||
# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique
|
||||
|
||||
# NOTE
|
||||
#drtype is renamed to 'resistance' in the 35k dataset
|
||||
|
||||
# output files: all lower case
|
||||
# 0) <gene>_common_ids.csv
|
||||
# 1) <gene>_ambiguous_muts.csv
|
||||
|
@ -60,6 +63,7 @@ os.getcwd()
|
|||
|
||||
# import aa dict
|
||||
from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
|
||||
from tidy_split import tidy_split
|
||||
#=======================================================================
|
||||
#%% command line args
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
|
@ -96,8 +100,8 @@ datadir = homedir + '/' + 'git/Data'
|
|||
#=======
|
||||
# input
|
||||
#=======
|
||||
in_filename = 'original_tanushree_data_v2.csv'
|
||||
#in_filename = 'mtb_gwas_v3.csv'
|
||||
#in_filename = 'original_tanushree_data_v2.csv' #19k
|
||||
in_filename = 'mtb_gwas_meta_v3.csv' #33k
|
||||
infile = datadir + '/' + in_filename
|
||||
print('Input file: ', infile
|
||||
, '\n============================================================')
|
||||
|
@ -121,17 +125,45 @@ master_data = pd.read_csv(infile, sep = ',')
|
|||
#list(master_data.columns)
|
||||
|
||||
# extract the relevant columns from the meta data related to the drug
|
||||
meta_data = master_data[['id'
|
||||
,'country'
|
||||
,'lineage'
|
||||
,'sublineage'
|
||||
,'drtype'
|
||||
, drug
|
||||
, dr_muts_col
|
||||
, other_muts_col
|
||||
]]
|
||||
|
||||
del(master_data)
|
||||
#meta_data_ch = master_data[['id'
|
||||
#, 'country'
|
||||
#, 'lineage'
|
||||
#, 'sublineage'
|
||||
##, 'drtype' #19k only
|
||||
#, 'resistance'
|
||||
#, drug
|
||||
#, dr_muts_col
|
||||
#, other_muts_col]]
|
||||
|
||||
|
||||
core_cols = ['id'
|
||||
, 'country'
|
||||
, 'country2'
|
||||
, 'geographic_source'
|
||||
, 'region'
|
||||
, 'date'
|
||||
, 'strain'
|
||||
, 'lineage'
|
||||
, 'sublineage' #drtype renamed to resistance
|
||||
, 'resistance'
|
||||
, 'location'
|
||||
, 'host_body_site'
|
||||
, 'environment_material'
|
||||
, 'host_status'
|
||||
, 'hiv_status'
|
||||
, 'HIV_status'
|
||||
, 'isolation_source']
|
||||
|
||||
variable_based_cols = [drug
|
||||
, dr_muts_col
|
||||
, other_muts_col]
|
||||
|
||||
cols_to_extract = core_cols + variable_based_cols
|
||||
|
||||
meta_data = master_data[cols_to_extract]
|
||||
|
||||
del(master_data, variable_based_cols, cols_to_extract)
|
||||
|
||||
# checks and results
|
||||
total_samples = meta_data['id'].nunique()
|
||||
|
@ -269,14 +301,23 @@ print('gene to extract:', gene_match )
|
|||
#===============
|
||||
# FIXME: replace drug with variable containing the drug name
|
||||
# !!! important !!!
|
||||
meta_data_dr = meta_data[['id'
|
||||
,'country'
|
||||
,'lineage'
|
||||
,'sublineage'
|
||||
,'drtype'
|
||||
, drug
|
||||
, dr_muts_col
|
||||
]]
|
||||
#meta_data_dr = meta_data[['id'
|
||||
# ,'country'
|
||||
# ,'lineage'
|
||||
# ,'sublineage'
|
||||
# ,'drtype'
|
||||
# , drug
|
||||
# , dr_muts_col
|
||||
# ]]
|
||||
|
||||
dr_based_cols = [drug, dr_muts_col]
|
||||
|
||||
cols_to_extract = core_cols + dr_based_cols
|
||||
|
||||
meta_data_dr = meta_data[cols_to_extract]
|
||||
|
||||
del(dr_based_cols, cols_to_extract)
|
||||
|
||||
print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
|
||||
print('actual dim:', meta_data_dr.shape
|
||||
, '\n===============================================================')
|
||||
|
@ -306,14 +347,22 @@ dr_id = pd.Series(dr_id)
|
|||
print('Extracting dr_muts from:', other_muts_col,'with other meta_data')
|
||||
# FIXME: replace drug with variable containing the drug name
|
||||
# !!! important !!!
|
||||
meta_data_other = meta_data[['id'
|
||||
,'country'
|
||||
,'lineage'
|
||||
,'sublineage'
|
||||
,'drtype'
|
||||
, drug
|
||||
, other_muts_col
|
||||
]]
|
||||
#meta_data_other = meta_data[['id'
|
||||
# ,'country'
|
||||
# ,'lineage'
|
||||
# ,'sublineage'
|
||||
## ,'drtype'
|
||||
# , drug
|
||||
# , other_muts_col
|
||||
# ]]
|
||||
|
||||
dr_based_cols = [drug, other_muts_col]
|
||||
|
||||
cols_to_extract = core_cols + dr_based_cols
|
||||
|
||||
meta_data_other = meta_data[cols_to_extract]
|
||||
|
||||
del(dr_based_cols, cols_to_extract)
|
||||
|
||||
print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
|
||||
print('actual dim:', meta_data_other.shape
|
||||
|
@ -373,7 +422,7 @@ print('Writing file:'
|
|||
, '\nExpected no. of rows:', len(common_ids)
|
||||
, '\n=============================================================')
|
||||
|
||||
common_ids.to_csv(outfile0)
|
||||
common_ids.to_csv(outfile0, index = False)
|
||||
del(out_filename0)
|
||||
|
||||
# clear variables
|
||||
|
@ -419,44 +468,15 @@ print('This is still dirty data: samples have ', gene_match, 'muts but may have
|
|||
#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
|
||||
print('Performing tidy_split(): to separate the mutations into indivdual rows')
|
||||
|
||||
# define the split function
|
||||
def tidy_split(df, column, sep='|', keep=False):
    '''
    Split the values of a column and expand so the new DataFrame has one
    split value per row.

    Every value in `column` is converted to a string (via `astype(str)`)
    before splitting. Rows whose value is missing are NOT dropped by this
    function: the `dropna` step below is intentionally left commented out,
    so such rows are carried through in their stringified form.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as its own row (only applies
        when the value actually splits into more than one piece)

    Returns
    -------
    pandas.DataFrame
        A new dataframe (a copy) with the same columns as `df`, where each
        input row is repeated once per split value and `column` holds the
        individual split values. The repeated rows keep the index labels
        of the rows they came from.
    '''
    row_positions = []   # positional index of the source row for each output row
    split_values = []    # value of `column` for each output row

    # Enable the next line if rows with a missing `column` value should be
    # removed before splitting (depends on the use case).
    #df = df.dropna(subset=[column])

    for pos, presplit in enumerate(df[column].astype(str)):
        pieces = presplit.split(sep)

        # Optionally emit the unsplit value first, but only when a split
        # really happened (otherwise it would duplicate the single piece).
        if keep and len(pieces) > 1:
            row_positions.append(pos)
            split_values.append(presplit)

        # One output row per piece, all pointing back at the same source row.
        row_positions.extend([pos] * len(pieces))
        split_values.extend(pieces)

    # Positional selection repeats source rows as needed; copy so the
    # assignment below never touches the caller's dataframe.
    new_df = df.iloc[row_positions, :].copy()
    new_df[column] = split_values
    return new_df
|
||||
|
||||
#%% end of tidy_split()
|
||||
|
||||
#TIDY SPLIT HERE
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#=========
|
||||
# DF1: dr_muts_col
|
||||
#=========
|
||||
|
@ -761,12 +781,11 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m
|
|||
out_filename1 = gene.lower() + '_ambiguous_muts.csv'
|
||||
outfile1 = outdir + '/' + out_filename1
|
||||
print('Writing file: ambiguous muts'
|
||||
, '\nFilename:', out_filename1
|
||||
, '\nPath:', outdir)
|
||||
, '\nFilename:', outfile1)
|
||||
|
||||
#common_muts = ['gene_matchVal180Phe','gene_matchGln10Pro'] # test
|
||||
inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
|
||||
inspect.to_csv(outfile1)
|
||||
inspect.to_csv(outfile1, index = False)
|
||||
|
||||
print('Finished writing:', out_filename1
|
||||
, '\nNo. of rows:', len(inspect)
|
||||
|
@ -1069,13 +1088,13 @@ else:
|
|||
print('FAIL: SNP has NA, Possible mapping issues from dict?'
|
||||
, '\nDebug please!'
|
||||
, '\n=========================================================')
|
||||
sys.exit()
|
||||
|
||||
out_filename2 = gene.lower() + '_mcsm_snps.csv'
|
||||
outfile2 = outdir + '/' + out_filename2
|
||||
|
||||
print('Writing file: mCSM style muts'
|
||||
, '\nFilename:', out_filename2
|
||||
, '\nPath:', outdir
|
||||
, '\nFilename:', outfile2
|
||||
, '\nmutation format (SNP): {WT}<POS>{MUT}'
|
||||
, '\nNo. of distinct muts:', len(snps_only)
|
||||
, '\nNo. of distinct positions:', len(pos_only)
|
||||
|
@ -1083,7 +1102,7 @@ print('Writing file: mCSM style muts'
|
|||
|
||||
snps_only.to_csv(outfile2, header = False, index = False)
|
||||
|
||||
print('Finished writing:', out_filename2
|
||||
print('Finished writing:', outfile2
|
||||
, '\nNo. of rows:', len(snps_only)
|
||||
, '\nNo. of cols:', len(snps_only.columns)
|
||||
, '\n=============================================================')
|
||||
|
@ -1099,7 +1118,7 @@ print('Writing file: LF formatted data'
|
|||
, '\n============================================================')
|
||||
|
||||
gene_LF1.to_csv(outfile3, header = True, index = False)
|
||||
print('Finished writing:', out_filename3
|
||||
print('Finished writing:', outfile3
|
||||
, '\nNo. of rows:', len(gene_LF1)
|
||||
, '\nNo. of cols:', len(gene_LF1.columns)
|
||||
, '\n=============================================================')
|
||||
|
@ -1118,11 +1137,11 @@ all_muts_msa.columns.dtype
|
|||
all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')
|
||||
|
||||
# create an extra column with protein name
|
||||
all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
|
||||
all_muts_msa_sorted.head()
|
||||
#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
|
||||
#all_muts_msa_sorted.head()
|
||||
|
||||
# rearrange columns so the fasta name is the first column (required for mutate.script)
|
||||
all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
|
||||
#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
|
||||
all_muts_msa_sorted.head()
|
||||
|
||||
print('Checking NA in snps...')# should be 0
|
||||
|
@ -1138,15 +1157,14 @@ out_filename4 = gene.lower() +'_all_muts_msa.csv'
|
|||
outfile4 = outdir + '/' + out_filename4
|
||||
|
||||
print('Writing file: mCSM style muts for msa',
|
||||
'\nFilename:', out_filename4,
|
||||
'\nPath:', outdir,
|
||||
'\nFilename:', outfile4,
|
||||
'\nmutation format (SNP): {WT}<POS>{MUT}',
|
||||
'\nNo.of lines of msa:', len(all_muts_msa),
|
||||
)
|
||||
|
||||
all_muts_msa_sorted.to_csv(outfile4, header = False, index = False)
|
||||
|
||||
print('Finished writing:', out_filename4
|
||||
print('Finished writing:', outfile4
|
||||
, '\nNo. of rows:', len(all_muts_msa)
|
||||
, '\nNo. of cols:', len(all_muts_msa.columns)
|
||||
, '\n=============================================================')
|
||||
|
@ -1177,7 +1195,7 @@ print('Writing file: mutational positions'
|
|||
|
||||
pos_only_sorted.to_csv(outfile5, header = True, index = False)
|
||||
|
||||
print('Finished writing:', out_filename5
|
||||
print('Finished writing:', outfile5
|
||||
, '\nNo. of rows:', len(pos_only_sorted)
|
||||
, '\nNo. of cols:', len(pos_only_sorted.columns)
|
||||
, '\n=============================================================')
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue