#!/usr/bin/env python
"""
Command line script for neo epitope prediction
Created March 2018
Copyright (C) Damien Farrell
"""
from __future__ import absolute_import, print_function
import sys, os, subprocess
import time
import pickle
import numpy as np
import pandas as pd
pd.set_option('display.width', 150)
pd.set_option('max_colwidth', 80)
from collections import OrderedDict
from epitopepredict import base, config, analysis, sequtils, peptutils, tepitope
defaultpath = os.getcwd()
sim_matrix = tepitope.get_matrix('pmbec')
metrics = ['score', 'matched_score', 'binding_diff','perc_rank',
'wt_similarity','self_similarity', 'virus_similarity',
'anchor','hydro', 'net_charge']
class NeoEpitopeWorkFlow(object):
"""Class for implementing a neo epitope workflow."""
    def __init__(self, opts=None):
        if opts is None:
            opts = {}
        for i in opts:
            self.__dict__[i] = opts[i]
        return
    def setup(self):
"""Setup main parameters"""
if check_imports() == False:
return
#check_ensembl()
pd.set_option('display.width', 120)
base.iedbmhc1path = self.iedbmhc1_path
base.iedbmhc2path = self.iedbmhc2_path
self.vcf_files = self.vcf_files.split(',')
f = self.vcf_files[0]
fileext = os.path.splitext(f)[1]
if fileext == '.txt' and os.path.exists(f):
print ('found file list')
self.vcf_files = read_names(f)
self.mhc1_alleles = self.mhc1_alleles.split(',')
self.mhc2_alleles = self.mhc2_alleles.split(',')
if len(self.mhc1_alleles)==0 and len(self.mhc2_alleles)==0:
return False
self.predictors = self.predictors.split(',')
for p in self.predictors:
if p not in base.predictors:
print ('unknown predictor in config file. Use:')
show_predictors()
return False
if self.mhc1_alleles[0] in base.mhc1_presets:
self.mhc1_alleles = base.get_preset_alleles(self.mhc1_alleles[0])
elif self.mhc2_alleles[0] in base.mhc2_presets:
self.mhc2_alleles = base.get_preset_alleles(self.mhc2_alleles[0])
if type(self.cutoffs) is int or type(self.cutoffs) is float:
self.cutoffs = [self.cutoffs]
else:
self.cutoffs = [float(i) for i in self.cutoffs.split(',')]
self.names=None
if not os.path.exists(self.path) and self.path != '':
os.mkdir(self.path)
return True
    def get_file_labels(self, files):
        """Return a dict mapping a label (the file basename) to each input file."""
        l=OrderedDict()
for f in files:
if not os.path.exists(f):
print ('no such file %s' %f)
continue
label = os.path.splitext(os.path.basename(f))[0]
l[label] = {'filename':f}
return l
    def run(self):
"""Run workflow for multiple samples and prediction methods."""
print ('running neoepitope predictions')
start = time.time()
path = self.path
overwrite = self.overwrite
files = self.vcf_files
preds = self.predictors
labels = self.get_file_labels(files)
cutoffs = self.cutoffs
if len(cutoffs) < len(preds) :
cutoffs = [cutoffs[0] for p in preds]
for f in labels:
print ('sample name: %s' %f)
infile = labels[f]['filename']
            #files to save variant effects to; if present we can skip recomputing them
eff_csv = os.path.join(path, 'variant_effects_%s.csv' %f)
eff_obj = os.path.join(path, 'variant_effects_%s.pickle' %f)
if not os.path.exists(eff_obj) or overwrite == True:
#get variant effects for each file and then iterate over predictors
variants = load_variants(vcf_file=infile)
labels[f]['variants'] = len(variants)
print ('getting variant effects')
effects = get_variants_effects(variants, self.verbose)
#serialize variant effects
effects_to_pickle(effects, eff_obj)
else:
#else reload from last run
effects = pickle.load(open(eff_obj,'rb'))
#save effects as table
eff_data = effects_to_dataframe(effects)
eff_data['sample'] = f
eff_data.to_csv(eff_csv)
#get mutated peptides
seqs = get_mutant_sequences(effects=effects, length=self.mhc1_length, verbose=self.verbose)
#get similarities
df = get_closest_matches(seqs, self.verbose, cpus=self.cpus)
i=0
for predictor in self.predictors:
outfile = os.path.join(path, 'results_%s_%s.csv' %(f,predictor))
if os.path.exists(outfile) and overwrite == False:
continue
if predictor in base.mhc1_predictors:
alleles = self.mhc1_alleles
#length = self.mhc1_length
else:
alleles = self.mhc2_alleles
#length = self.mhc2_length
res = predict_binding(df, alleles=alleles, predictor=predictor,
verbose=self.verbose, cpus=self.cpus)
res['label'] = f
res.to_csv(outfile, index=False)
#gets promiscuous binders based on the cutoff
#P = base.get_predictor(predictor)
#P.data = res
#pb = P.promiscuous_binders(n=1, keep_columns=True, cutoff=cutoffs[i])
#pb['label'] = f
#print (pb[:20])
#pb.to_csv(os.path.join(path, 'binders_%s_%s.csv' %(f,predictor)), index=False)
i+=1
#combine results if multiple predictors?
#combine_results()
#combine results for multiple samples?
#save sample labels
pd.DataFrame(labels).T.to_csv(os.path.join(path, 'sample_labels.csv'))
print ('finished, results saved to %s' %path)
end = round(time.time()-start,1)
print ('took %s seconds' %end)
return
    def combine_samples(self, labels):
        """Put peptides from multiple samples in one table."""
        res=[]
        for name in labels:
            df = pd.read_csv(os.path.join(self.path, 'results_%s_tepitope.csv' %name))
            res.append(df)
        res = pd.concat(res)
        return pd.pivot_table(res, index=['peptide'], columns=['label'], values='score')
def pbmec_score(seq1, seq2):
    """Score a pair of peptides with the PMBEC similarity matrix."""
    x=0
    try:
        for i in seq1:
            for j in seq2:
                x+=sim_matrix[i][j]
    except KeyError:
        #residue not present in the matrix
        return -1
    return x
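#Example (a minimal sketch, not part of the module): score two 9-mer peptides
#against each other with the PMBEC matrix loaded at module level.
#
#   s = pbmec_score('KLLPRVTLA', 'KLLPRVTMA')
#   print (s)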
def get_alleles(f):
    """Get input alleles from a comma separated string or a text file listing them."""
fileext = os.path.splitext(f)[1]
if fileext == '.txt' and os.path.exists(f):
items = read_names(f)
else:
items = f.split(',')
return items
def read_names(filename):
    """Read a plain text file of items, one per line."""
with open(filename) as f:
p = f.readlines()
p = [x.strip() for x in p]
p = list(filter(None, p))
return p
def variants_from_csv(csv_file, sample_id=None, reference=None):
    """Load variants from a csv file.
    Args:
        csv_file: csv file with the following column names-
            CHROM, POS, REF, ALT and optionally sample_id
        sample_id: if provided, select variants only for this id
        reference: ref genome used for variant calling
    """
from pyensembl import ensembl_grch38
import varcode
from varcode import Variant
df = pd.read_csv(csv_file)
variants=[]
    if sample_id is not None and 'sample_id' in df.columns:
df = df[df.sample_id==sample_id]
df = df.drop_duplicates(['POS','REF','ALT'])
for i,r in list(df.iterrows()):
#print i
v = Variant(contig=r.CHROM, start=r.POS, ref=r.REF, alt=r.ALT, ensembl=ensembl_grch38)
variants.append(v)
varcl = varcode.variant_collection.VariantCollection(variants)
return varcl
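#Example usage (a sketch with hypothetical file and sample names): a csv with the
#columns read above, e.g. CHROM,POS,REF,ALT,sample_id. Assumes the GRCh38
#pyensembl data has already been installed.
#
#   variants = variants_from_csv('my_variants.csv', sample_id='sample1')
#   print ('%s variants' %len(variants))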
def dataframe_to_vcf(df, outfile):
    """Write a dataframe of variants to a simple vcf file. The dataframe requires
    the following columns: 'CHROM','POS','REF','ALT'
    """
f = open(outfile, 'w')
f.write('##fileformat=VCFv4.0\n')
f.write('##reference=GRCh38.fa\n')
f.write('##source=csv\n')
f.write('##FILTER=<ID=PASS,Description="Passed all filters">\n')
f.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
df = df.rename(columns={'CHROM':'#CHROM'})
df['ID']='.'
df['FILTER']='PASS'
df['QUAL'] = 60
df['INFO'] = '.'
df['FORMAT'] = 'GT'
df['sample'] = '0/1'
cols = ['#CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','sample']
    df[cols].to_csv(f, sep='\t', index=False)
    f.close()
    return
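#Example usage (a sketch with made-up coordinates): build a dataframe with the
#required columns and write it out as a minimal vcf.
#
#   vdf = pd.DataFrame({'CHROM':['chr1','chr7'], 'POS':[155234452,140753336],
#                       'REF':['A','T'], 'ALT':['G','A']})
#   dataframe_to_vcf(vdf, 'variants.vcf')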
def get_variant_class(effect):
    """Return a simple class label (snv/insertion/deletion/indel/frameshift) for a variant effect."""
    import varcode
v = effect.variant
if v.is_deletion:
return 'deletion'
elif v.is_insertion:
return 'insertion'
elif v.is_snv:
return 'snv'
elif v.is_indel:
return 'indel'
elif type(effect) is varcode.effects.effect_classes.FrameShift:
return 'frameshift'
def effects_to_pickle(effects, filename):
    """Serialize a variant effects collection to a pickle file."""
pickle.dump(effects, open(filename, "wb"), protocol=2)
return
def effects_to_dataframe(effects):
    """Convert a collection of variant effects to a summary dataframe."""
x=[]
for eff in effects:
if eff is None:
continue
d=OrderedDict()
d['gene_name'] = eff.gene_name
d['transcript_id'] = eff.transcript_id
wt = eff.original_protein_sequence
mut = eff.mutant_protein_sequence
vloc = eff.aa_mutation_start_offset
d['aa_change'] = eff.short_description
d['mutation'] = eff.variant.short_description
d['variant_class'] = get_variant_class(eff)
#d['wt_sequence'] = wt
#d['mut_sequence'] = mut
x.append(d)
df = pd.DataFrame(x)
df['chr'] = df.apply(lambda x: x.mutation.split(' ')[0],1)
return df
'''def filter_variant_effects(effects, verbose=False):
"""Get fitered list of effects.
Omits silent and noncoding effects.
Returns:
list of varcode variant effect objects
"""
effs = effects.drop_silent_and_noncoding()
filt = []
for eff in effs:
v = eff.variant
mut = eff.mutant_protein_sequence
#if type(eff) is varcode.effects.effect_classes.FrameShift:
# print eff, eff.shifted_sequence
if mut is None:
continue
vloc = eff.aa_mutation_start_offset
if vloc is None or len(v.coding_genes) == 0:
continue
if verbose==True:
print (v.gene_names, type(eff))
print (eff.transcript_id, eff.short_description, eff.variant.ref)
#print v.contig
#print mut
#print eff.to_dict()['variant'].to_dict()
filt.append(eff)
return filt'''
def get_variants_effects(variants, verbose=False, gene_expression_dict=None):
    """Get all coding, non-silent effects from a collection of variants.
    Args:
        variants: varcode variant collection
        gene_expression_dict: optional gene expression values used to filter effects
    Returns:
        list of varcode variant effect objects"""
from varcode.effects import Substitution, Insertion, Deletion
effects = variants.effects()
effects = effects.drop_silent_and_noncoding()
#print (len(effects))
effects = effects.filter_by_effect_priority(Substitution)
if gene_expression_dict is not None:
effects = effects.filter_by_gene_expression(gene_expression_dict)
print ('%s effects from %s variants' %(len(effects),len(variants)))
return effects
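#Example usage (a sketch, file name hypothetical): load variants from a vcf and
#reduce them to coding, non-silent effects before peptide generation.
#
#   variants = load_variants(vcf_file='sample.vcf')
#   effects = get_variants_effects(variants, verbose=True)
#   eff_data = effects_to_dataframe(effects)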
def peptides_from_effect(eff, length=11, peptides=True, verbose=False):
"""Get mutated peptides from a single effect object.
Returns:
dataframe with peptides and variant info
"""
import varcode
pad = length-1
    if eff is None:
return
gene = eff.gene_name
varclass = get_variant_class(eff)
orig = eff.original_protein_sequence
mut = eff.mutant_protein_sequence
vloc = eff.aa_mutation_start_offset
if vloc is None or mut is None:
return
st = vloc-pad; end = vloc+pad+1
if st<0: st=0
#if frameshift changed sequence may be long
if type(eff) is varcode.effects.effect_classes.FrameShift:
#mutpep = mut[vloc:]
mutpep = eff.shifted_sequence
#print (mutpep)
wt = None
#print (type(eff), len(orig), len(mut), vloc, len(mutpep), mutpep, mut[vloc:], wt)
#print (mut)
else:
mutpep = mut[st:end]
if varclass == 'snv':
wt = orig[st:end]
else:
wt = None
#if verbose == True:
# print (type(eff), len(orig), len(mut), vloc, st, end, len(mutpep))
if len(mutpep)<length:
#if verbose == True:
# print ('peptide length too small')
return
if peptides is True:
df = peptutils.get_fragments(seq=mutpep, length=length)
df['pos'] = pd.Series(range(st,end))
df['prot_length_ratio'] = len(mut)/float(len(orig))
        if wt is not None:
wdf = peptutils.get_fragments(seq=wt, length=length)
df['wt'] = wdf.peptide
else:
df['wt'] = None
#print (df)
else:
#just return the mutated protein
df = pd.DataFrame.from_dict([{'wt_sequence':orig,'mutant_sequence': mut}])
df['pos'] = vloc
#print gene,st,end,mutpep
#print df
df['name'] = gene
df['transcript_id'] = eff.transcript_id
#df['transcript_name'] = eff.transcript_name
df['aa_change'] = eff.short_description
df['mutation'] = eff.variant.short_description
df['variant_class'] = varclass
return df
def get_mutant_sequences(variants=None, effects=None, reference=None, peptides=True,
                        drop_duplicates=True, length=11, verbose=False):
    """
    Get mutant protein sequences or peptide fragments from a set of variants or effects.
    Args:
        variants: varcode variant collection
        effects: non-synonymous effects, alternative to variants
        peptides: get peptide fragments around mutation
Returns:
pandas dataframe with mutated peptide sequence and source information
"""
res = []
if variants is not None:
effects = get_variants_effects(variants, verbose)
if effects is None:
print ('no variant information')
return
for eff in effects:
#print (eff)
peps = peptides_from_effect(eff, length=length, peptides=peptides, verbose=verbose)
if peps is None:
continue
res.append(peps)
res = pd.concat(res).reset_index(drop=True)
#remove rows where mut same as wt peptide
res = res[res.peptide!=res.wt]
if drop_duplicates == True:
res = res.drop_duplicates('peptide')
print ('%s sequences/peptides from %s effects' %(len(res),len(effects)))
return res
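#Example usage (a sketch, file name hypothetical): derive 11-mer mutant peptides,
#with matching wild type peptides for snvs, directly from a vcf.
#
#   variants = load_variants(vcf_file='sample.vcf')
#   seqs = get_mutant_sequences(variants=variants, length=11)
#   print (seqs[['name','peptide','wt','variant_class']][:10])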
def load_variants(vcf_file=None, maf_file=None, max_variants=None):
    """Load a variant collection from a vcf or maf file."""
import varcode
if vcf_file is not None:
variants = varcode.load_vcf(vcf_file, allow_extended_nucleotides=True, max_variants=max_variants)
f=vcf_file
elif maf_file is not None:
variants = varcode.load_maf(maf_file)
f=maf_file
print ('%s variants read from %s' %(len(variants),f))
return variants
def get_closest_matches(df, verbose=False, cpus=1):
    """
    Annotate peptides with their closest self/viral proteome matches and similarity metrics.
    """
if verbose == True:
print ('finding matches to self proteome')
#find matches to self proteome, adds penalty score column to df
#we should only blast non-duplicates....
df = self_matches(df, cpus=cpus)
if verbose == True:
print ('finding matches to viral proteomes')
df = virus_matches(df, cpus=cpus)
#get similarity scores for wt and closest match to proteome
matrix = 'pmbec'
df['wt_similarity'] = df.apply(lambda x: wt_similarity(x, matrix=matrix),1)
df['self_similarity'] = df.apply(lambda x: self_similarity(x, matrix=matrix),1)
df['virus_similarity'] = df.apply(lambda x: virus_similarity(x, matrix=matrix),1)
#get closest peptide in another column, either wt or nearest self
df['closest'] = df.apply(get_closest_match, 1)
df['length'] = df.peptide.str.len()
#exclude exact matches to self?
if verbose==True:
print ('%s peptides with exact matches to self' %len(df[df.self_mismatches==0]))
return df
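#Example usage (a sketch): annotate mutant peptides with self/viral proteome
#matches and similarity scores. Requires a local blast+ install since the
#matching is done by blasting against the downloaded proteome databases.
#
#   seqs = get_mutant_sequences(effects=effects, length=11)
#   seqs = get_closest_matches(seqs, verbose=True, cpus=2)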
def predict_binding(df, predictor='netmhcpan', alleles=[],
verbose=False, cpus=1, cutoff=.95, cutoff_method='default'):
"""
Predict binding scores for mutated and wt peptides (if present) from supplied variants.
Args:
df: pandas dataframe with peptide sequences, requires at least 2 columns
'peptide' - the mutant peptide
'wt' - a corresponding wild type peptide
this data could be generated from get_mutant_sequences or from an external program
predictor: mhc binding prediction method
alleles: list of alleles
Returns:
dataframe with mutant and wt binding scores for all alleles
"""
P = base.get_predictor(predictor, scoring='ligand')
print (P)
print ('predicting mhc binding for %s peptides with %s' %(len(df), P.name))
peps = list(df.peptide)
res = P.predict_peptides(peps, alleles=alleles, cpus=cpus,
cutoff=cutoff, cutoff_method=cutoff_method, drop_columns=True)
if res is None:
print ('no binding predictions!')
return
#predict closest matching peptide affinity
if verbose == True:
print ('predicting wt peptides')
wtpeps = list(df.closest)
    #predict binding for the wild type/matched peptides
b_wt = P.predict_peptides(wtpeps, alleles=alleles, cpus=cpus,
cutoff=cutoff, cutoff_method=cutoff_method, drop_columns=True)
#combine mutant and matching binding predictions
res = combine_wt_scores(res, b_wt, P.scorekey)
res = res.drop(['pos','name'],1)
#combine binding results with main dataframe
res = df.merge(res, on='peptide')
res['binding_diff'] = res[P.scorekey]/res.matched_score
#anchor position mutated in any 9-mers
res['anchor'] = res.apply(anchor_mutated, 1)
#hydrophobicity and net charge
res = analysis.peptide_properties(res, 'peptide')
res['length'] = res.peptide.str.len()
#merge promiscuity measure into results
#if len(pb) > 0:
# res = res.merge(pb[['peptide','alleles']], on='peptide',how='left')
#else:
# res['alleles'] = 0
#rename some columns
res = res.rename(columns={'rank':'binding_rank','alleles':'promiscuity'})
res = res.sort_values('binding_rank', ascending=True)
return res
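#Example usage (a sketch): predict binding for mutant peptides and their closest
#wt/self matches. Assumes get_closest_matches has already added the 'closest'
#column and that the chosen prediction method is installed locally.
#
#   res = predict_binding(seqs, predictor='netmhcpan', alleles=['HLA-A*02:01'], cpus=2)
#   top = res[res.binding_rank <= 2]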
def score_peptides(df, rf=None):
    """Score peptides with a trained classifier. Returns prediction probabilities."""
    if rf is None:
        try:
            import joblib
        except ImportError:
            #older scikit-learn versions bundled joblib
            from sklearn.externals import joblib
        rf = joblib.load(os.path.join(base.datadir,'rf_model.joblib'))
X = df[metrics]
X = X.fillna(X.mean())
X = X.replace(np.inf,.1)
sc = rf.predict_proba(X)[:,1]
return sc
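#Example usage (a sketch, column name hypothetical): score binding results with
#the bundled random forest model. The dataframe must contain the columns listed
#in `metrics` at the top of this module.
#
#   res['score_prob'] = score_peptides(res)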
def combine_wt_scores(x, y, key):
    """Combine mutant peptide and matching wt/self binding scores from a
    set of predictions. Assumes both dataframes were run with the same alleles.
    Args:
        x,y: pandas dataframes with matching prediction results
        key: name of the score column copied from y into the 'matched_score' column
    """
x = x.sort_values(['pos','allele']).reset_index(drop=True)
y = y.sort_values(['pos','allele']).reset_index(drop=True)
x['matched_score'] = y[key]
return x
def make_blastdb(url, name=None, filename=None, overwrite=False):
    """Download protein sequences and make a blast db from them. Uses the datacache module."""
import datacache
cachedir = datacache.get_data_dir()
blastdb = os.path.join(cachedir, name)
if os.path.exists(blastdb+'.phr') and overwrite==False:
#print ('blast files found')
return blastdb
filename = datacache.fetch_file(url, filename=filename, decompress=True, subdir=None)
#print filename
cmd = 'makeblastdb -dbtype prot -in %s -out %s' %(filename,blastdb)
#print cmd
tmp=subprocess.check_output(cmd, shell=True)
return blastdb
def make_human_blastdb():
    """Make a blast db of the human proteome."""
url = 'ftp://ftp.ensembl.org/pub/release-87/fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz'
filename = 'Homo_sapiens.GRCh38.pep.all.fa.gz'
blastdb = make_blastdb(url, name='GRCh38', filename=filename)
return blastdb
def make_virus_blastdb():
    """Make a blast db of human virus reference proteomes."""
url = 'http://www.uniprot.org/uniprot/?sort=score&desc=&compress=no&query=taxonomy:%22Viruses%20[10239]%22%20\
keyword:%22Reference%20proteome%20[KW-1185]%22%20host:%22Homo%20sapiens%20(Human)%20[9606]%22&fil=&force=no&preview=true&format=fasta'
filename = 'uniprot_human_virus_proteome.fa.gz'
blastdb = make_blastdb(url, name='human_virus', filename=filename)
return blastdb
def self_matches(df, **kwargs):
    """Find the closest human (self) proteome match for each peptide."""
blastdb = make_human_blastdb()
x = find_matches(df, blastdb, **kwargs)
x = x.rename(columns={'sseq':'self_match','mismatch':'self_mismatches'})
return x
def virus_matches(df, **kwargs):
    """Find the closest human virus proteome match for each peptide."""
blastdb = make_virus_blastdb()
x = find_matches(df, blastdb, **kwargs)
if 'sseq' in x.columns:
x = x.rename(columns={'sseq':'virus_match','mismatch':'virus_mismatches'})
else:
x['virus_match'] = None
return x
def find_matches(df, blastdb, cpus=4, verbose=False):
    """
    Get similarity measures for peptides against a reference proteome. Does a
    local blast search against the proteome and finds the most similar match
    for each peptide. These matches can then be scored.
Args:
df: dataframe of peptides
blastdb: path to protein blastdb
Returns:
dataframe with extra columns: 'sseq','mismatch'
"""
if verbose == True:
print ('blasting %s peptides' %len(df))
length = df.peptide.str.len().max()
def check_mm(x):
#corrected mismatches for shorter hits
if x.length<length:
return length-x.length+x.mismatch
else:
return x.mismatch
bl = sequtils.blast_sequences(blastdb, df.peptide, evalue=200000, cpus=cpus,
ungapped=True, gapopen=10, gapextend=2, qcov_hsp_perc=100,
comp_based_stats=0)
if len(bl) == 0:
if verbose == True:
print ('no hits found!')
return df
if verbose == True:
print ('%s hits' %len(bl))
cols = ['qseqid','sseq','mismatch']
#ignore any hits with gaps
bl = bl[(bl.gapopen==0)]# & (bl.length>=length)]
#take longest hit with lowest e-value for each query
bl = bl.sort_values(['qseqid','length','evalue'],ascending=(True,False,True))
bl = bl.groupby(['qseqid'],as_index=False).first()
#correct mismatches to account for shorter hits
bl['mismatch'] = bl.apply(check_mm, 1)
bl = bl[cols]
#merge results
x = df.merge(bl,left_on='peptide',right_on='qseqid', how='left')
x = x.sort_values(by='mismatch',ascending=True)
x = x.drop(['qseqid'],1)
#x['exact_match'] = x.mismatch.clip(0,1).fillna(1)
return x
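#Example usage (a sketch): blast a set of peptides against the human proteome db
#built by make_human_blastdb and inspect the nearest hits.
#
#   blastdb = make_human_blastdb()
#   hits = find_matches(seqs, blastdb, cpus=2, verbose=True)
#   print (hits[['peptide','sseq','mismatch']][:10])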
def wt_similarity(x, matrix='blosum62'):
    """Similarity score between the mutant peptide and its wild type counterpart."""
x1 = x.peptide
x2 = x.wt
#print(x1,x2)
matrix = tepitope.get_matrix(matrix)
return tepitope.similarity_score(matrix,x1,x2)
def self_similarity(x, matrix='blosum62'):
    """Similarity score between the peptide and its closest self proteome match."""
if x.self_match is None:
return
x1 = x.peptide
x2 = x.self_match
matrix = tepitope.get_matrix(matrix)
return tepitope.similarity_score(matrix,x1,x2)
def virus_similarity(x, matrix='blosum62'):
    """Similarity score between the peptide and its closest viral proteome match."""
if x.virus_match is None:
return
x1 = x.peptide
x2 = x.virus_match
matrix = tepitope.get_matrix(matrix)
return tepitope.similarity_score(matrix,x1,x2)
def get_closest_match(x):
    """Return the closest matching peptide for a row: the wt peptide if present,
    otherwise the nearest self proteome match. Intended for use with apply."""
if x.wt is None:
return x.self_match
else:
return x.wt
def anchor_mutated(x):
    """Check whether the mutation alters an anchor position of the peptide."""
return peptutils.compare_anchor_positions(x.wt, x.peptide)
def summary_plots(df):
    """Summary plots for testing results."""
    import matplotlib.pyplot as plt
    f,axs=plt.subplots(2,2,figsize=(10,10))
    axs=axs.flat
g = df.groupby(['name']).size().sort_values(ascending=False)[:20]
g.plot(kind='barh',ax=axs[0],color='gray')
axs[0].set_title('peptide counts')
df.variant_class.value_counts().plot(kind='pie',autopct='%.1f',ax=axs[1])
axs[1].set_title('variant classes')
df.self_mismatches.value_counts().sort_index().plot(kind='bar',ax=axs[2])
axs[2].set_title('mismatches to self')
#df.wt_similarity.hist(ax=axs[3])
#df.plot('wt_similarity','self_similarity',kind='scatter',ax=axs[3])
df.plot('score','matched_score',kind='scatter',ax=axs[3])
return
def show_predictors():
    """List the available binding prediction methods."""
for p in base.predictors:
print(p)
def check_imports():
    """Check that optional dependencies required by this module are installed."""
try:
import varcode
except Exception as e:
print (e)
print ('varcode required. please run pip install varcode')
return False
return True
def fetch_ensembl_release(path=None, release='75'):
"""Get pyensembl genome files"""
from pyensembl import Genome,EnsemblRelease
#this call should download the files
genome = EnsemblRelease(release, species='human')
genome.download(overwrite=False)
genome.index(overwrite=False)
genome.cache_directory_path = path
print ('pyensembl genome files cached in %s' %genome.cache_directory_path)
#run_pyensembl_install()
return
def check_ensembl(release='75'):
    """Check that the pyensembl reference genome is cached. Needed when running inside a snap package."""
#check if running inside a snap package so we can download
#the genome files for pyensembl
cache_dir=None
if base.check_snap() is True:
#home = os.path.join('/home', os.environ['USER'])
home = os.environ['SNAP_USER_COMMON']
cache_dir = os.path.join(home, '.cache')
os.environ['PYENSEMBL_CACHE_DIR'] = cache_dir
print ('checking for ref human genome')
fetch_ensembl_release(cache_dir, release)
return
def run_vep(vcf_file, out_format='vcf', assembly='GRCh38', cpus=4, path=None):
"""Run ensembl VEP on a vcf file for use with pvacseq.
see https://www.ensembl.org/info/docs/tools/vep/script/index.html
"""
fname = os.path.splitext(vcf_file)[0]
out = fname+'.vep.%s' %out_format
    if path is None:
path = '/local/ensembl-vep/'
path = os.path.join(path,'./vep')
cmd = '{p} --input_file {i} --pick --force_overwrite \
--assembly {a} --fork {c} \
--symbol --terms SO --output_file {o} \
--plugin Downstream --plugin Wildtype \
--cache --offline'.format(o=out,i=vcf_file,a=assembly,c=cpus,p=path)
if out_format == 'vcf':
cmd += ' --format vcf --vcf'
print (cmd)
tmp = subprocess.check_output(cmd, shell=True)
return
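#Example usage (a sketch, file and install path hypothetical): annotate a vcf
#with a local VEP install, producing sample.vep.vcf for downstream tools.
#
#   run_vep('sample.vcf', out_format='vcf', assembly='GRCh38', cpus=4,
#           path='/local/ensembl-vep/')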
def print_help():
print ("""use -h to get options""")
def plot_variant_summary(data):
    """Donut plot of variant counts. Requires the legacy bokeh.charts module."""
    from bokeh.plotting import figure
    from bokeh.charts import Donut
    d = Donut(data, label=['abbr', 'medal'], values='medal_count',
              text_font_size='8pt', hover_text='medal_count')
    return d
def test_run():
"""Test run for sample vcf file"""
print ('neoepitope workflow test')
path = os.path.dirname(os.path.abspath(__file__))
options = config.baseoptions
options['base']['predictors'] = 'netmhcpan,tepitope'
options['base']['mhc1_alleles'] = 'HLA-A*02:01'
options['base']['path'] = 'neo_test'
options['base']['overwrite'] = True
#options['base']['mhc2_length'] = 11
#options['base']['verbose'] = True
options['base']['cpus'] = 2
options['neopredict']['vcf_files'] = os.path.join(path, 'testing','input.vcf')
options['neopredict']['release'] = '75'
options = config.check_options(options)
#print (options)
W = NeoEpitopeWorkFlow(options)
check_ensembl(release='75')
st = W.setup()
#check_ensembl()
W.run()
def varcode_test():
    """Basic test of variant loading and effect prediction on the sample vcf file."""
path = os.path.dirname(os.path.abspath(__file__))
infile = os.path.join(path, 'testing','input.vcf')
variants = load_variants(vcf_file=infile)
get_variants_effects(variants)
return