Source code for epitopepredict.base

#!/usr/bin/env python

"""
    MHC prediction base module for core classes
    Created November 2013
    Copyright (C) Damien Farrell
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License
    as published by the Free Software Foundation; either version 3
    of the License, or (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""

from __future__ import absolute_import, print_function
import sys, os, shutil, string
import csv, glob, pickle, tempfile
import time, io
import operator as op
import re, types
try:
    unicode = unicode
except NameError:
    #unicode is undefined, must be Python 3
    str = str
    unicode = str
    basestring = (str,bytes)
import math
import subprocess
from subprocess import CalledProcessError
import numpy as np
import pandas as pd
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from . import utilities, peptutils, sequtils, tepitope, config, mhclearn

#write a default config if not present
config_file = config.write_default_config()
home = os.path.expanduser("~")
config_path = os.path.join(home, '.config/epitopepredict')
module_path = os.path.dirname(os.path.abspath(__file__)) #path to module
datadir = os.path.join(module_path, 'mhcdata')
predictors = ['basicmhc1','tepitope','netmhciipan','netmhcpan','mhcflurry','iedbmhc1','iedbmhc2']
iedbmhc1_methods = ['ann', 'IEDB_recommended', 'comblib_sidney2008', 'consensus', 'smm', 'smmpmbec']
mhc1_predictors = ['basicmhc1','netmhcpan','iedbmhc1','mhcflurry'] + iedbmhc1_methods
iedbsettings = {'cutoff_type': 'none', 'pred_method': 'IEDB_recommended',
            'output_format': 'ascii', 'sort_output': 'position_in_sequence',
            'sequence_format': 'auto', 'allele': 'HLA-DRB1*01:01', 'length':'11',
            'sequence_file': None}

mhc1_presets = ['mhc1_supertypes','us_caucasion_mhc1','us_african_mhc1','broad_coverage_mhc1']
mhc2_presets = ['mhc2_supertypes','human_common_mhc2','bovine_like_mhc2']

#sequence for testing
testsequence = ('MRRVILPTAPPEYMEAIYPVRSNSTIARGGNSNTGFLTPESVNGDTPSNPLRPIADDTIDHASHTPGSVS'
               'SAFILEAMVNVISGPKVLMKQIPIWLPLGVADQKTYSFDSTTAAIMLASYTITHFGKATNPLVRVNRLGP'
               'GIPDHPLRLLRIGNQAFLQEFVLPPVQLPQYFTFDLTALKLITQPLPAATWTDDTPTGSNGALRPGISFH'
               'PKLRPILLPNKSGKKGNSADLTSPEKIQAIMTSLQDFKIVPIDPTKNIMGIEVPETLVHKLTGKKVTSKN'
               'GQPIIPVLLPKYIGLDPVAPGDLTMVITQDCDTCHSPASLPAVIEK')
presets_dir = os.path.join(module_path, 'presets')

#genome for testing
mtb_genome = os.path.join(datadir, 'MTB-H37Rv.gb')

def read_defaults():
    """Get some global settings such as program paths from config file"""
    cp = config.parse_config(config_file)
    options = config.get_options(cp)
    return options

#get defaults
defaults = read_defaults()

def predict_proteins_worker(P, recs, kwargs):
    try:
        df = P._predict_sequences(recs, **kwargs)
    except KeyboardInterrupt:
        return
    return df

def predict_peptides_worker(P, recs, kwargs):
    try:
        df = P._predict_peptides(recs, **kwargs)
    except KeyboardInterrupt:
        return
    return df

def get_preset_alleles(name):
    """A list of the possible preset alleles"""
    df = pd.read_csv(os.path.join(presets_dir, name+'.csv'), comment='#')
    return list(df.allele)

def first(x):
    return x.iloc[0]

def get_pos(x):
    x['pos'] = np.arange(len(x))
    return x
def get_iedb_request(seq, alleles='HLA-DRB1*01:01', method='consensus3'):
    """Make a web request to the IEDB tools API for MHC-II predictions"""
    import requests
    url = 'http://tools.iedb.org/tools_api/mhcii/'
    values = {'method': method,
              'sequence_text': seq,
              'allele': alleles}
    r = requests.post(url, data=values)
    #use the decoded response body rather than raw bytes
    df = pd.read_csv(io.StringIO(r.text), sep='\t')
    return df

def get_overlapping(index, s, length=9, cutoff=25):
    """Get all mutually overlapping kmers within a cutoff area"""
    g = [s]
    vals = [i for i in range(s, s+cutoff) if i in index]
    for i in range(len(vals)-1):
        if vals[i+1] <= vals[i]+length:
            g.append(vals[i+1])
        else:
            break
    return g

def get_predictor_classes():
    """Get predictor classes in this module."""
    cl = {}
    for o in Predictor.__subclasses__():
        cl[o._get_name()] = o
    return cl

def get_predictor(name='tepitope', **kwargs):
    """Get a predictor object using its name. Valid predictor names
       are held in the predictors attribute."""
    cl = get_predictor_classes()
    if name in iedbmhc1_methods:
        name = 'iedbmhc1'
    if name in cl:
        return cl[name](**kwargs)
    else:
        print('no such predictor %s' %name)
        print('valid names are %s' %', '.join(predictors))
        return
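#Example: a minimal sketch of typical get_predictor use. It assumes the
#package data files are installed; the allele name is illustrative.
#
#   from epitopepredict import base
#   P = base.get_predictor('tepitope')
#   df = P.predict_sequences(base.testsequence, alleles=['HLA-DRB1*0101'])
#   print(P)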
def get_length(data):
    """Get peptide length of a dataframe of predictions"""
    if len(data) > 0:
        return len(data.head(1).peptide.max())
    return

def get_coords(df):
    """Get start/end coords from position and length of peptides"""
    if 'start' in df.columns:
        return df
    df['start'] = df.pos.astype(int)
    df['end'] = (df.pos + df.peptide.str.len()).astype(int)
    return df

def sequence_from_peptides(df):
    """Derive the source sequence from a set of overlapping peptides"""
    l = get_length(df)
    df = df.drop_duplicates('pos').sort_values('pos')
    x = df.peptide.str[0]
    last = df.iloc[-1].peptide[1:]
    x = ''.join(x)
    x = x + last
    return x

def seq_from_binders(df):
    x = pd.Series(df.sort_values('pos').peptide.unique())
    return ''.join(x.str[0])

def write_fasta(sequences, id=None, filename='tempseq.fa'):
    """Write a fasta file of sequences"""
    if isinstance(sequences, basestring):
        sequences = [sequences]
    out = open(filename, 'w')
    i = 1
    for seq in sequences:
        if id == None:
            id = 'temp%s' %i
        SeqIO.write(SeqRecord(Seq(seq), id, description='temp'), out, 'fasta')
        i += 1
    out.close()
    return filename

def get_sequence(seqfile):
    """Get sequence from fasta file"""
    recs = list(SeqIO.parse(seqfile, 'fasta'))[0]
    #Seq.tostring() was removed from Biopython, use str() instead
    sequence = str(recs.seq)
    return sequence

def clean_sequence(seq):
    """Clean a sequence of invalid characters before prediction"""
    new = re.sub('[-*_#X]', '', seq)
    return new
def get_nearest(df):
    """Get distance to the nearest other binder for each binder"""
    grps = df.groupby('name')
    new = []
    def closest(x, g):
        if len(g.pos) == 1:
            return 1
        return min([abs(x.pos-i) for i in g.pos if i != x.pos])
    for i,g in grps:
        g['nearest'] = g.apply(lambda x: closest(x,g), axis=1)
        new.append(g)
    df = pd.concat(new)
    return df

def summarize(data):
    """Summarise prediction data"""
    allelegrps = data.groupby('allele')
    proteins = data.groupby('name')
    print('summary: %s peptides in %s proteins and %s alleles' %(len(data),
           len(proteins), len(allelegrps)))
    return

def get_filenames(path, names=None, file_limit=None):
    """Get csv filenames in a results path, optionally filtered by name"""
    if not os.path.exists(path):
        print('no such path %s' %path)
        return
    files = glob.glob(os.path.join(path, '*.csv'))
    if names is not None:
        names = [n+'.csv' for n in names]
        files = [n for n in files if os.path.basename(n) in names]
    if len(files) == 0:
        return
    if file_limit != None:
        files = files[:file_limit]
    return files

def results_from_csv(path=None, names=None, compression='infer', file_limit=None):
    """
    Load results from a single csv file or multiple csv files in a folder.

    Args:
        path: name of a csv file or directory with one or more csv files
        names: names of proteins to load
        file_limit: load only this number of files
    """
    if os.path.isfile(path):
        data = pd.read_csv(path, index_col=0)
    elif os.path.isdir(path):
        files = get_filenames(path, names, file_limit)
        if files == None:
            return
        res = []
        for f in files:
            df = pd.read_csv(f, index_col=0, compression=compression)
            if len(df) == 0:
                continue
            res.append(df)
        data = pd.concat(res)
    else:
        return
    return data
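#Example: a minimal sketch of saving and re-loading results, assuming a
#predictor P with prediction data already present. Paths are illustrative.
#
#   P.save(prefix='results')                  #one csv per protein under results/<P.name>/
#   df = results_from_csv('results/tepitope')
#   P.load('results/tepitope')                #equivalent, stores into P.data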
def get_quantiles(predictor):
    """
    Get quantile score values per allele in a set of predictions.
    Used for making pre-defined cutoffs.

    Args:
        predictor: predictor with a set of predictions
    """
    P = predictor
    res = []
    value = P.scorekey
    data = P.data
    for a,g in data.groupby('allele'):
        q = g[value].quantile(np.arange(0,1,.01))
        q.name = a
        res.append(q)
    res = pd.DataFrame(res)
    if P.rankascending == 1:
        res.columns = res.columns.sort_values(ascending=False)
    return res

def get_standard_mhc1(name):
    """Taken from iedb mhc1 utils.py"""
    temp = name.strip().split('-')
    length = temp[-1]
    mhc = '-'.join(temp[0:-1])
    return mhc

def get_drb_list(a):
    """Get DRB list in standard format"""
    s = pd.Series(a)
    s = s[s.str.contains('DRB')]
    s = s.apply(lambda x: 'HLA-'+x.replace('_','*'))
    return list(s)

def get_dqp_list(a):
    """Get DQ list in standard format"""
    s = pd.Series(a)
    s = s[s.str.contains('DQ')]
    s = s.apply(lambda x: x.replace('_','*'))
    return list(s)

def get_standard_mhc2(x):
    return 'HLA'+x.replace('_','*')

def compare_predictors(p1, p2, by='allele', cutoff=5, n=2):
    """
    Compare predictions from 2 different predictors.

    Args:
        p1, p2: predictors with prediction results for the same
        set of sequences and alleles
        by: how to group the correlation plots
    """
    import pylab as plt
    import seaborn as sns

    a = p1.promiscuous_binders(n=n, cutoff=cutoff)
    b = p2.promiscuous_binders(n=n, cutoff=cutoff)
    f = utilities.venndiagram([a.peptide, b.peptide], [p1.name, p2.name], colors=('y','b'))
    f.suptitle('common\npromiscuous binders n=%s' %n)
    plt.tight_layout()
    for p in [p1,p2]:
        if not 'score' in p.data.columns:
            p.data['score'] = p.data[p.scorekey]
    #merge data for plotting the score correlation
    df = pd.merge(p1.data, p2.data, on=['peptide','allele','name','pos'])
    g = sns.lmplot(x='score_x', y='score_y', data=df, col=by, ci=None,
                   hue=by, col_wrap=3, markers='o',
                   scatter_kws={'alpha':0.5,'s':30})
    g.set_axis_labels(p1.name, p2.name)
    return
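#Example: comparing two predictors on the same inputs, a sketch assuming both
#tools are installed and the same sequences/alleles were predicted.
#
#   p1 = get_predictor('tepitope')
#   p2 = get_predictor('netmhciipan')
#   for p in (p1, p2):
#       p.predict_sequences(recs, alleles=alleles)   #recs/alleles defined elsewhere
#   compare_predictors(p1, p2, n=2)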
def reshape_data(pred, peptides=None, name=None, values='score'):
    """
    Create a summary table of scores per peptide/allele.

    Args:
        pred: predictor with data
        peptides: optional list of peptides to filter on
        name: optional protein name to filter on
        values: column to use for the table values
    """
    df = pred.data
    idx = ['name','pos','peptide']
    if name != None:
        df = df[df.name==name]
        idx = ['pos','peptide']
    if peptides is not None:
        df = df[df.peptide.isin(peptides)]
    p = df.pivot_table(index=idx, columns='allele', values=values)
    p = p.round(3)
    order = p.mean(1).sort_values()
    #reindex_axis is deprecated, use reindex
    p = p.reindex(order.index, axis=0)
    return p

def plot_summary_heatmap(p, kind='default', name=None):
    """Plot a heatmap of binders using a summary dataframe."""
    import pylab as plt
    import seaborn as sns
    sns.set_context("notebook", font_scale=1.2)
    plt.rcParams['savefig.dpi'] = 100
    h = int(2 + .2*len(p))
    plt.figure(figsize=(10,h))
    ax = sns.heatmap(p, cbar_kws={"shrink": .5})
    if name != None:
        ax.set_title(name)
    return ax

def protein_summary(pred, peptides, name):
    """Formatted protein summary table"""
    x = reshape_data(pred, peptides, name=name)
    return x.style.background_gradient(cmap='Reds', high=.2).highlight_null('white')

def summarize_by_protein(pred, pb):
    """Heatmaps or tables of binders per protein/allele"""
    for i,g in pb.groupby('name'):
        #reshape_data replaces the undefined binders_allele_summary call
        x = reshape_data(pred, g.peptide, values='score', name=i)
        ax = plot_summary_heatmap(x, name=i)
def split_peptides(df, length=9, seqkey='sequence', newcol='peptide'):
    """Split sequences in a dataframe into peptide fragments"""
    res = []
    for n,r in df.iterrows():
        seq = r[seqkey]
        p = peptutils.get_fragments(seq, 1, length)
        p = p.rename(columns={'peptide': newcol})
        p['name'] = n
        p[seqkey] = seq
        p = p.set_index('name')
        res.append(p)
    res = pd.concat(res)
    res = df.merge(res, on=seqkey)
    return res
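#Example: a sketch of splitting sequences into 9-mers. The input dataframe
#and column names here are illustrative.
#
#   seqs = pd.DataFrame({'sequence': [testsequence]})
#   frags = split_peptides(seqs, length=9, seqkey='sequence')
#   print(frags.peptide.head())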
def check_snap():
    """Check if running inside a snap"""
    if 'SNAP_COMMON' in os.environ:
        return True
    return False

def set_netmhcpan_cmd(path=None):
    """Set up the netMHCpan command to point directly to the binary. This is
       a workaround for running inside snaps and avoids using the tcsh script."""
    toolspath = os.path.join('/home', os.environ['USER'], 'tools')
    netmhcpath = os.path.join(toolspath, 'netMHCpan-4.0/Linux_x86_64')
    if not os.path.exists(netmhcpath):
        print('you need to copy the netMHCpan files to %s' %toolspath)
    os.environ['NETMHCpan'] = netmhcpath
    os.environ['TMPDIR'] = '/tmp'
    cmd = os.path.join(netmhcpath, 'bin/netMHCpan')
    return cmd

class DataFrameIterator:
    """Simple iterator to get dataframes from a path out of memory"""

    def __init__(self, files):
        self.files = files
        self.current = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self.current >= len(self.files):
            raise StopIteration
        else:
            df = pd.read_csv(self.files[self.current], index_col=0)
            self.current += 1
            return df

    def next(self):
        return self.__next__()

    def __repr__(self):
        return 'DataFrameIterator with %s files' %len(self.files)
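#Example: iterating over saved result files without loading them all at once,
#a sketch assuming a folder of per-protein csv files written by Predictor.save.
#
#   files = get_filenames('results/tepitope')
#   for df in DataFrameIterator(files):
#       print(len(df))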
class Predictor(object):
    """Base class to handle generic predictor methods. Usually these will
       wrap methods from other modules and/or call command line predictors.
       Subclass for specific functionality."""

    def __init__(self, data=None):
        self.data = data
        self.name = ''
        self.scorekey = 'score'
        self.operator = '<'
        self.rankascending = 0
        #can specify per allele cutoffs here
        self.allelecutoffs = None
        self.temppath = tempfile.mkdtemp()
        self.path = None
        return

    @classmethod
    def _get_name(self):
        return 'base class'

    def __repr__(self):
        if (self.data is None) or len(self.data) == 0:
            return '%s predictor' %self.name
        else:
            n = len(self.data.name.unique())
            return '%s predictor with results in %s sequences' %(self.name, n)

    def supported_lengths(self):
        """Return supported peptide lengths"""
        return [9,11]

    def predict(self, sequence=None, peptides=None, length=9, overlap=1,
                allele='', name=''):
        """Does the actual scoring of a sequence. Should be overriden.
           Should return a pandas DataFrame."""
        return

    def prepare_data(self, result, name, allele):
        """Put raw prediction data into a DataFrame and rank. Can be
           overriden for custom processing."""
        df = pd.DataFrame(result, columns=['peptide','core','pos','score'])
        df['name'] = name
        df['allele'] = allele
        self.get_ranking(df)
        return df

    def get_ranking(self, df):
        """Add a ranking column according to scorekey"""
        s = self.scorekey
        df['rank'] = df[s].rank(method='min', ascending=self.rankascending)
        df.sort_values(by=['rank','name','allele'], ascending=True, inplace=True)
        return df

    def evaluate(self, df, key, value, operator='<'):
        """
        Evaluate binders less than or greater than a cutoff.
        This method is called by all predictors to get binders.
        """
        if operator == '<':
            return df[df[key] <= value]
        else:
            return df[df[key] >= value]

    def get_quantile_data(self):
        """Get peptide score rank from quantile data."""
        path = os.path.join(datadir, 'quantiles_%s.csv' %self.name)
        qf = pd.read_csv(path, index_col=0)
        qf.columns = qf.columns.astype('float')
        return qf

    def get_global_rank(self, score, allele):
        """Get an allele specific score percentile from precalculated
           quantile data."""
        qf = self.qf
        s = qf.loc[allele]
        cutoff = np.abs(s - score).idxmin()
        return cutoff

    def get_allele_cutoffs(self, cutoff=.95):
        """Get per allele percentile cutoffs using precalculated quantile values."""
        qf = self.qf
        cutoff = round(cutoff, 2)
        if not cutoff in qf.columns:
            print('please use values between 0 and 1 rounded to 2 decimal places e.g. .95 (95% cutoff)')
        return qf[cutoff]

    def _per_allele_binders(self, data, cuts):
        """Return binders using per allele cutoffs"""
        value = self.scorekey
        res = []
        for a,g in data.groupby('allele'):
            if a not in cuts:
                print('%s not in cutoffs' %a)
                continue
            if self.rankascending == 0:
                b = g[g[value] >= cuts[a]]
            else:
                b = g[g[value] < cuts[a]]
            res.append(b)
        return pd.concat(res)

    def _ranked_binders(self, data, cutoff):
        """Return binders by rank, which has been calculated per sequence/allele"""
        return data[data['rank'] <= cutoff]

    def _score_binders(self, data, cutoff):
        """Return binders by a global single score cutoff"""
        if self.rankascending == 0:
            res = data[data[self.scorekey] >= cutoff]
        else:
            res = data[data[self.scorekey] <= cutoff]
        return res

    def get_scores(self, allele):
        """Return peptides and scores only for an allele"""
        if self.data is None or len(self.data) == 0:
            return
        df = self.data[self.data.allele == allele]
        sc = df[['peptide','score']]
        return sc
    def get_binders(self, cutoff=.95, cutoff_method='default', path=None,
                    name=None, drop_columns=False, limit=None, **kwargs):
        """
        Get the top scoring binders. Default cutoffs are percentiles derived
        from pre-defined predictions over some known antigens. A rank within
        each sequence or an absolute score can be used instead; these will
        give slightly different results.

        Args:
            path: use results in a path instead of loading at once, conserves memory
            cutoff: percentile cutoff (default), absolute score or a rank value within each sequence
            cutoff_method: 'default', 'score' or 'rank'
            name: name of a specific protein/sequence
        Returns:
            binders above cutoff in all alleles, pandas dataframe
        """
        if cutoff_method not in ['default', 'global', 'score', 'rank']:
            print('choose a valid cutoff method')
            return
        cutoff = float(cutoff)
        data = self.data
        if cutoff_method in ['default','global','']:
            #per allele percentile cutoffs
            cuts = self.get_allele_cutoffs(cutoff)
        if path is not None:
            #get binders out of memory for large datasets
            files = get_filenames(path)
            if files == None:
                return
            if name is not None:
                files = [f for f in files if f.find(name+'.csv') != -1]
            d = DataFrameIterator(files)
            res = []
            for data in d:
                if cutoff_method in ['default','global','']:
                    b = self._per_allele_binders(data, cuts)
                elif cutoff_method == 'rank':
                    b = self._ranked_binders(data, cutoff)
                elif cutoff_method == 'score':
                    b = self._score_binders(data, cutoff)
                res.append(b)
            res = pd.concat(res)
        else:
            if data is None:
                return
            if cutoff_method in ['default','global','']:
                res = self._per_allele_binders(data, cuts)
            elif cutoff_method == 'rank':
                res = self._ranked_binders(data, cutoff)
            elif cutoff_method == 'score':
                res = self._score_binders(data, cutoff)
        names = list(res['name'].unique())
        if name != None and name in names:
            res = res[res.name == name]
        if limit != None:
            res = res.groupby('name').head(limit)
        return res
    def promiscuous_binders(self, binders=None, name=None, cutoff=.95,
                            cutoff_method='default', n=1, unique_core=True,
                            limit=None, **kwargs):
        """
        Get binders present in at least n alleles. If no precalculated
        binders are provided, get_binders is called with the given cutoff
        parameters.

        Args:
            binders: can provide a precalculated list of binders
            name: specific protein, optional
            cutoff: cutoff for get_binders (rank, score or percentile)
            cutoff_method: 'default', 'rank' or 'score'
            n: min number of alleles
            unique_core: removes peptides with duplicate cores and picks the
            most promiscuous and highest ranked, used for mhc-II predictions
            limit: limit the number of peptides per protein, default None
        Returns:
            a pandas dataframe
        """
        n = int(n)
        if binders is None:
            binders = self.get_binders(name=name, cutoff=cutoff,
                                       cutoff_method=cutoff_method)
        if binders is None:
            print('no binders found, check that prediction data is present')
            return
        if 'core' not in binders.columns:
            binders['core'] = binders.peptide
        if self.operator == '<':
            func = min
            skname = 'min'
        else:
            func = max
            skname = 'max'
        s = binders.groupby(['peptide','pos','name']).agg({'allele': pd.Series.count,
                            'core': first, self.scorekey: [func, np.mean],
                            'rank': np.median})
        s.columns = s.columns.get_level_values(1)
        s.rename(columns={skname: self.scorekey, 'count': 'alleles',
                          'median': 'median_rank', 'first': 'core'}, inplace=True)
        s = s.reset_index()
        s = s.sort_values(['alleles','median_rank',self.scorekey],
                          ascending=[False,True,self.rankascending])
        #if we just want unique cores, drop_duplicates takes the most promiscuous
        #in each group since we have sorted by alleles and median rank
        if unique_core == True:
            s = s.drop_duplicates('core')
        s = s[s.alleles >= n]
        if limit != None:
            s = s.groupby('name').head(limit)
        return s
    def ranked_binders(self, names=None, how='median', cutoff=None):
        """
        Get the median/mean rank of each binder over all alleles.

        Args:
            names: list of protein names, otherwise all current data used
            how: method to use for rank selection, 'median' (default),
            'best' or 'mean'
            cutoff: apply a rank cutoff if we want to filter (optional)
        """
        df = self.data
        if names != None:
            if isinstance(names, str):
                names = [names]
            df = df[df.name.isin(names)]
        funcs = {'median': np.median, 'mean': np.mean, 'best': min}
        func = funcs[how]
        b = df.groupby(['peptide']).agg({'rank': func, 'pos': first,
                                         'name': first, self.scorekey: np.median})
        b = b.reset_index().sort_values('rank')
        if cutoff != None:
            b = b[b['rank'] < cutoff]
        return b

    def get_unique_cores(self, binders=False):
        """Get only unique cores"""
        if binders == True:
            df = self.get_binders()
        else:
            df = self.data
        grouped = df.groupby('core')
        cores = grouped.agg({self.scorekey: max})
        #DataFrame.sort was removed from pandas, use sort_values
        cores = cores.sort_values(self.scorekey, ascending=self.rankascending)
        return cores

    def seqs_to_dataframe(self, seqs):
        df = pd.DataFrame(seqs, columns=['peptide'])
        df['name'] = df.peptide
        df['pos'] = range(len(seqs))
        return df
    def _predict_peptides(self, peptides, alleles=[], verbose=False,
                          drop_columns=False, **kwargs):
        """
        Predict a set of arbitrary peptide sequences in a list or dataframe.
        These are treated as individual peptides and not split into n-mers.
        This is usually called via predict_peptides. If the predict method
        for the class can only accept protein sequences this needs to be
        overriden.

        Args:
            peptides: list of peptides
            alleles: list of alleles to predict
            drop_columns: only keep default columns
        Returns:
            dataframe with results
        """
        res = []
        if len(alleles) == 0:
            print('no alleles specified')
            return
        if type(alleles) is str:
            alleles = [alleles]
        #remove empty values
        peptides = [i for i in peptides if isinstance(i, basestring)]
        for a in alleles:
            df = self.predict(peptides=peptides, allele=a, **kwargs)
            if df is None or len(df) == 0:
                continue
            res.append(df)
            if verbose == True and len(df) > 0:
                x = df.iloc[0]
                r = self.format_row(x)
                print(r)
        if len(res) == 0:
            return
        data = pd.concat(res)
        #drop non-standard columns for cleaner output
        if drop_columns == True:
            defaultcols = ['pos','peptide',self.scorekey,'allele','rank','name']
            data = data[defaultcols]
        self.data = data
        return data

    def predict_peptides(self, peptides, threads=1, path=None, overwrite=True,
                         name=None, **kwargs):
        """
        Predict a set of individual peptides without splitting them.
        This is a wrapper for _predict_peptides to allow multiprocessing.

        Args:
            peptides: list of peptides
            threads: number of threads to use
            path: if given, save the results to this path
        Returns:
            dataframe with results
        """
        if path is not None:
            fname = os.path.join(path, 'results_%s.csv' %self.name)
            if overwrite == False and os.path.exists(fname):
                #load the data if present and not overwriting
                print('results file found, not overwriting')
                self.path = fname
                return
        #store original peptide list as a dataframe to keep order
        s = pd.DataFrame(peptides, columns=['peptide'])
        s['pos'] = s.index.copy()
        if threads == 1:
            data = self._predict_peptides(peptides, **kwargs)
        else:
            data = self._run_multiprocess(peptides, worker=predict_peptides_worker,
                                          threads=threads, **kwargs)
        if data is None or len(data) == 0:
            print('empty result returned')
            return
        #get original peptide positions by re-merging with the input dataframe,
        #this also keeps empty rows in the correct order
        new = []
        for i,g in data.groupby('allele'):
            if 'pos' in g.columns:
                g = g.drop(columns='pos')
            x = s.merge(g, on=['peptide'], how='left').drop_duplicates(['pos','peptide'])
            x['allele'] = x.allele.fillna(i)
            new.append(x)
        data = pd.concat(new).reset_index(drop=True)
        data = data.groupby('allele').apply(self.get_ranking)
        data = data.reset_index(drop=True)
        if name == None:
            name = 'temp'
        if 'name' not in data.columns:
            data['name'] = name
        if path is not None:
            data.to_csv(fname, float_format='%g')
            self.path = fname
        else:
            self.data = data
        return data

    def _convert_to_dataframe(self, recs):
        """Convert sequence list input to a dataframe"""
        if isinstance(recs, str):
            recs = [recs]
        if isinstance(recs, list):
            idx = [str(i) for i in range(len(recs))]
            recs = pd.DataFrame(list(zip(idx, recs)), columns=['locus_tag','translation'])
        return recs
    def predict_proteins(self, args, **kwargs):
        """Alias to predict_sequences"""
        results = self.predict_sequences(args, **kwargs)
        return results

    def predict_sequences(self, recs, alleles=[], path=None, verbose=False,
                          names=None, key='locus_tag', seqkey='translation',
                          threads=1, **kwargs):
        """
        Get predictions for a set of proteins over multiple alleles, with
        optional parallel execution via the threads parameter.
        This is a wrapper for _predict_sequences with the same args.

        Args:
            recs: list or dataframe with sequences
            path: if provided, save results to this path
            threads: number of processors
            key: seq/protein name key
            seqkey: key for sequence column
            length: length of peptides to split sequence into
        Returns:
            a dataframe of predictions over multiple proteins
        """
        recs = self._convert_to_dataframe(recs)
        if names is not None:
            recs = recs[recs[key].isin(names)]
        if len(recs) == 0:
            print('no records to predict')
            return
        if verbose == True:
            self.print_heading()
        if threads == 1:
            results = self._predict_sequences(recs, alleles=alleles, path=path,
                                              verbose=verbose, **kwargs)
        else:
            results = self._run_multiprocess(recs, alleles=alleles, path=path,
                                             verbose=verbose, threads=threads, **kwargs)
        print('predictions done for %s sequences in %s alleles' %(len(recs), len(alleles)))
        if path is None:
            #if no path we assign results to the data attribute
            self.data = results
        else:
            print('results saved to %s' %os.path.abspath(path))
            results = None
        self.cleanup()
        return results

    def _predict_sequences(self, recs, path=None, overwrite=True, alleles=[],
                           length=11, overlap=1, verbose=False, compression=None,
                           **kwargs):
        """
        Get predictions for a set of proteins and/or over multiple alleles.
        Sequences should be put into a dataframe and supplied to this method
        as the first argument.

        Args:
            recs: protein sequences in a pandas DataFrame
            path: if results are to be saved to disk provide a path
            overwrite: overwrite existing protein files in path if present
            alleles: allele list
            length: length of peptides to predict
            overlap: overlap of n-mers
            verbose: provide output per protein/sequence
        Returns:
            a dataframe of the results if no path is given
        """
        if type(alleles) is str:
            alleles = [alleles]
        elif type(alleles) is pd.Series:
            alleles = alleles.tolist()
        if len(alleles) == 0:
            print('no alleles provided')
            return
        if path is not None and path != '':
            if not os.path.exists(path):
                try:
                    os.mkdir(path)
                except (OSError, FileExistsError):
                    pass
        results = []
        self.length = length
        if compression == '':
            compression = None
        for i,row in recs.iterrows():
            seq = row.translation
            #clean the sequence of non-aa characters
            seq = clean_sequence(seq)
            name = row.locus_tag
            if path is not None:
                fname = os.path.join(path, name+'.csv')
                if os.path.exists(fname) and overwrite == False:
                    continue
            res = []
            peptides, s = peptutils.create_fragments(seq=seq, length=length,
                                                     overlap=overlap)
            for a in alleles:
                #we pass sequence, length and overlap for predictors that
                #have to run on the whole sequence
                df = self.predict(peptides=peptides, sequence=seq, allele=a,
                                  name=name, length=length, overlap=overlap, **kwargs)
                if df is not None:
                    res.append(df)
                else:
                    continue
                if verbose == True and len(df) > 0:
                    x = df.iloc[0]
                    s = self.format_row(x)
                    print(s)
            if len(res) == 0:
                continue
            res = pd.concat(res, sort=True)
            if path is not None and len(res) > 0:
                if compression == 'gzip':
                    fname = fname+'.gz'
                res.to_csv(fname, compression=compression)
            else:
                results.append(res)
        if len(results) > 0:
            results = pd.concat(results)
        return results

    def print_heading(self):
        s = ("{:<25} {:<16} {:<18} {:<}"
             .format('name','allele','top peptide','score'))
        print(s)
        return

    def format_row(self, x):
        s = ("{:<25} {:<16} {:<18} {:} "
             .format(x['name'], x.allele, x.peptide, x[self.scorekey]))
        return s

    def _run_multiprocess(self, recs, threads=2, worker=None, **kwargs):
        """
        Call a function with multiprocessing pools. Used for running
        predictions in parallel where the main input is a pandas dataframe.

        Args:
            recs: input dataframe
            threads: number of cores to use
            worker: function to be run in parallel
        Returns:
            concatenated result, a pandas dataframe
        """
        if worker is None:
            worker = predict_proteins_worker
        import multiprocessing as mp
        maxcpu = mp.cpu_count()
        if threads == 0 or threads > maxcpu:
            threads = maxcpu
        if threads >= len(recs):
            threads = len(recs)
        pool = mp.Pool(threads)
        funclist = []
        st = time.time()
        chunks = np.array_split(recs, threads)
        for recs in chunks:
            f = pool.apply_async(worker, [self, recs, kwargs])
            funclist.append(f)
        result = []
        try:
            for f in funclist:
                df = f.get(timeout=None)
                if df is not None and len(df) > 0:
                    result.append(df)
        except KeyboardInterrupt:
            print('process interrupted')
            pool.terminate()
            sys.exit(0)
        pool.close()
        pool.join()
        if len(result) > 0:
            result = pd.concat(result)
        t = time.time()-st
        if t > 10:
            print('took %s seconds' %str(round(t,3)))
        return result
    def load(self, path=None, names=None, compression='infer', file_limit=None):
        """Load results from a path or single file. See results_from_csv for args."""
        if path is None and self.path != None:
            #use the stored path attribute if no path arg is given
            path = self.path
        data = results_from_csv(path, names, compression, file_limit)
        if data is None:
            print('no data found in path %s' %path)
        else:
            self.data = data
            if not self.scorekey in data.columns:
                print('WARNING. this does not look like the correct data for this predictor')
        return

    def save(self, prefix='_', filename=None, compression=None):
        """
        Save all current predictions dataframe with some metadata.

        Args:
            prefix: if writing to a path, the prefix name
            filename: if saving all to a single file
            compression: a string representing the compression to use,
            allowed values are 'gzip', 'bz2', 'xz'.
        """
        exts = {'gzip':'.gz','bz2':'.bz2','xz':'.xz'}
        if filename != None:
            if compression != None:
                cext = exts[compression]
                if not filename.endswith(cext):
                    filename += cext
            self.data.to_csv(filename, compression=compression)
        else:
            #save one file per protein/name
            ext = '.csv'
            path = os.path.join(prefix, self.name)
            print('saving to %s' %path)
            if not os.path.exists(path):
                os.makedirs(path)
            for name,df in self.data.groupby('name'):
                outfile = os.path.join(path, name+ext)
                df.to_csv(outfile)
        return

    def save_msgpack(self, filename=None):
        """Save as msgpack format - experimental"""
        if filename == None:
            filename = 'epit_%s_%s.msg' %(self.name, self.length)
        print('saving as %s' %filename)
        meta = {'method': self.name, 'length': self.length}
        pd.to_msgpack(filename, meta)
        for i,g in self.data.groupby('name'):
            pd.to_msgpack(filename, g, append=True)
        return
    def summarize(self):
        """Summarise currently loaded data"""
        return summarize(self.data)

    def allele_summary(self, cutoff=5):
        """Allele based summary"""
        b = self.get_binders(cutoff=cutoff)
        s = b.groupby('allele').agg({'peptide': np.size,
                                     self.scorekey: [np.median, np.mean]})
        s.columns = s.columns.get_level_values(1)
        return s

    def protein_summary(self):
        print(self.data.groupby('name').agg({'peptide': np.size}))

    def proteins(self):
        return list(self.data.name.unique())

    def reshape(self, name=None):
        """Return pivoted data over alleles for summary use"""
        df = self.data
        if name != None:
            df = df[df.name==name]
        p = df.pivot_table(index='peptide', columns='allele', values=self.scorekey)
        p = p.reset_index()
        x = list(df.groupby('allele'))[0][1]
        p = p.merge(x[['pos','peptide']], on='peptide')
        p['mean'] = p.mean(1)
        #DataFrame.sort was removed from pandas, use sort_values
        p = p.sort_values('mean', ascending=self.rankascending)
        return p

    def get_names(self):
        """Get names of sequences currently stored as predictions"""
        grp = self.data.groupby('name')
        return sorted(dict(list(grp)).keys())
    def plot(self, name, **kwargs):
        """
        Use the module level plotting.plot_tracks method for a predictor plot.

        Args:
            name: name of the sequence to plot
            n: min number of alleles to be visible
            perc: percentile cutoff for score
            cutoff_method: method to use for cutoffs
        """
        from . import plotting
        if name == None:
            return
        plot = plotting.plot_tracks([self], name=name, **kwargs)
        return plot

    def get_alleles(self):
        """Get available alleles - override"""
        return []

    def check_alleles(self, alleles):
        a = self.get_alleles()
        found = list((set(a) & set(alleles)))
        return found

    def cleanup(self):
        """Remove temp files from predictions"""
        if os.path.exists(self.temppath):
            shutil.rmtree(self.temppath)
        return

    def check_install(self):
        return True
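#Example: a typical workflow on the Predictor API, a sketch assuming a working
#predictor subclass (tepitope ships with the package). Preset and allele names
#are illustrative.
#
#   P = get_predictor('tepitope')
#   alleles = get_preset_alleles('mhc2_supertypes')
#   P.predict_sequences([testsequence], alleles=alleles, length=11)
#   b = P.get_binders(cutoff=.95, cutoff_method='default')
#   pb = P.promiscuous_binders(binders=b, n=2)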
class NetMHCPanPredictor(Predictor):
    """netMHCpan 4.1b predictor, see http://www.cbs.dtu.dk/services/NetMHCpan/
       Default scoring is affinity predictions. To get the newer scoring
       behaviour pass scoring='ligand' to the constructor."""

    def __init__(self, data=None, scoring='affinity'):
        Predictor.__init__(self, data=data)
        self.name = 'netmhcpan'
        self.colnames = ['pos','allele','peptide','core','Of','Gp','Gl','Ip','Il',
                         'Icore','name','raw_score','ic50','rank']
        self.scorekey = 'score'
        self.scoring = scoring
        if self.scoring == 'affinity':
            self.cutoff = 500
            self.operator = '<'
            self.rankascending = 1
        elif scoring == 'ligand':
            self.cutoff = 0.5
            self.operator = '>'
            self.rankascending = 0
        #load precalculated quantiles for sample peptides
        self.qf = self.get_quantile_data()
        self.basecmd = 'netMHCpan'
        #base command needs to point to the binary directly if running in a snap
        if check_snap() is True:
            self.basecmd = set_netmhcpan_cmd()

    def read_result(self, temp):
        """Read raw results from netMHCpan 4.1b output"""
        ignore = ['Pos','#','Protein','']
        cols = ['pos','allele','peptide','core','Of','Gp','Gl','Ip','Il','Icore',
                'Identity','Score_EL','%Rank_EL','Score_BA','%Rank_BA','ic50','BindLevel']
        res = pd.read_csv(io.BytesIO(temp), comment='#', names=cols, sep='\s+',
                          error_bad_lines=False, skiprows=46).dropna(subset=['peptide'])
        res = res[~res.pos.isin(ignore)]
        return res

    def read_result_legacy(self, temp):
        """Read raw results from netMHCpan 4.0 output"""
        ignore = ['Pos','#','Protein','']
        if self.scoring == 'affinity':
            cols = ['pos','allele','peptide','core','Of','Gp','Gl','Ip','Il','Icore',
                    'Identity','Score','ic50','%Rank','X','BindLevel']
        else:
            cols = ['pos','allele','peptide','core','Of','Gp','Gl','Ip','Il','Icore',
                    'Identity','Score','%Rank','X','BindLevel']
        res = pd.read_csv(io.BytesIO(temp), comment='#', names=cols, sep='\s+',
                          error_bad_lines=False, skiprows=46).dropna(subset=['peptide'])
        res = res.drop(columns='X')
        res = res[~res.pos.isin(ignore)]
        return res

    def prepare_data(self, df, name):
        """Prepare netMHCpan results"""
        df = df.apply(lambda x: pd.to_numeric(x, errors='ignore').dropna())
        df['name'] = name
        if self.scoring == 'affinity':
            df['score'] = df.ic50
        else:
            df['score'] = df.Score_EL
        df = df.reset_index(drop=True)
        df['pos'] = df.index.copy()+1
        df = df.dropna(how='all')
        self.get_ranking(df)
        self.data = df
        return

    def predict(self, peptides, allele='HLA-A*01:01', name='temp',
                pseudosequence=None, show_cmd=False, **kwargs):
        """Call netMHCpan command line."""
        allele = allele.replace('*','')
        #write peptides to temp file
        pepfile = tempfile.mktemp()+'.pep'
        with open(pepfile, 'w') as f:
            for p in peptides:
                f.write(p+'\n')
        if self.scoring == 'affinity':
            cmd = '%s -BA -f %s -inptype 1 -a %s' %(self.basecmd, pepfile, allele)
        else:
            cmd = '%s -f %s -inptype 1 -a %s' %(self.basecmd, pepfile, allele)
        if show_cmd is True:
            print(cmd)
        try:
            temp = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
        except Exception as e:
            return
        res = self.read_result(temp)
        if len(res) == 0:
            print('empty result, check allele %s is correct' %allele)
            return res
        self.prepare_data(res, name=name)
        return self.data

    def get_alleles(self):
        """Get available alleles"""
        cmd = 'netMHCpan -listMHC'
        try:
            temp = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
        except:
            print('netMHCpan not installed?')
            return []
        s = temp.decode().split('\n')
        alleles = [i for i in s if not i.startswith('#') and len(i) > 0]
        alleles = [self.convert_allele_name(i) for i in alleles]
        return alleles

    def convert_allele_name(self, a):
        """Convert allele names to internally used form"""
        if not ':' in a and a.startswith('HLA'):
            a = a[:-2]+':'+a[-2:]
        return a

    def check_install(self):
        cmd = '%s -h' %self.basecmd
        try:
            temp = subprocess.check_output(cmd, shell=True)
            print('netMHCpan appears to be installed')
            return 1
        except:
            return 0

    @classmethod
    def _get_name(self):
        return 'netmhcpan'
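#Example: using the netMHCpan wrapper directly, a sketch that assumes the
#netMHCpan binary is installed and on the PATH. Peptides/alleles are illustrative.
#
#   P = NetMHCPanPredictor(scoring='ligand')
#   if P.check_install():
#       df = P.predict_peptides(['SLYNTVATL','GILGFVFTL'], alleles=['HLA-A*02:01'])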
class NetMHCIIPanPredictor(Predictor):
    """netMHCIIpan v3.0 predictor"""

    def __init__(self, data=None):
        Predictor.__init__(self, data=data)
        self.name = 'netmhciipan'
        self.colnames = ['pos','HLA','peptide','Identity','Pos','Core',
                         '1-log50k(aff)','Affinity','Rank']
        self.scorekey = 'score'
        self.cutoff = 500
        self.operator = '<'
        self.rankascending = 1
        #load precalculated quantiles for sample peptides
        self.qf = self.get_quantile_data()

    def read_result(self, temp):
        """Read raw results from netMHCIIpan output"""
        ignore = ['Pos','#','Protein','']
        cols = self.colnames
        res = pd.read_csv(io.BytesIO(temp), comment='#', names=cols, sep='\s+',
                          error_bad_lines=False, skiprows=16).dropna(subset=['peptide'])
        res = res[~res.pos.isin(ignore)]
        return res

    def prepare_data(self, df, name):
        """Prepare netMHCIIpan results as a dataframe"""
        df = df.apply(lambda x: pd.to_numeric(x, errors='ignore').dropna())
        df['name'] = name
        df.rename(columns={'Core': 'core','HLA':'allele'}, inplace=True)
        df = df.drop(columns=['Pos','Identity','Rank'])
        df['allele'] = df.allele.apply(lambda x: self.convert_allele_name(x))
        df['score'] = pd.to_numeric(df.Affinity, errors='coerce')
        df['pos'] = df.index.copy()
        df = df.dropna()
        self.get_ranking(df)
        self.data = df
        return

    def predict(self, peptides, allele='HLA-DRB1*0101', name='temp',
                pseudosequence=None, show_cmd=False, **kwargs):
        """Call netMHCIIpan command line."""
        #assume allele names are in the standard format HLA-DRB1*0101
        allele = self.allele_mapping(allele)
        #write peptides to temp file
        pepfile = tempfile.mktemp()+'.pep'
        with open(pepfile, 'w') as f:
            for p in peptides:
                f.write(p+'\n')
        cmd = 'netMHCIIpan -f %s -inptype 1 -a %s' %(pepfile, allele)
        if show_cmd is True:
            print(cmd)
        try:
            temp = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
        except Exception as e:
            print(e)
            return
        rows = self.read_result(temp)
        res = pd.DataFrame(rows)
        if len(res) == 0:
            return res
        self.prepare_data(res, name)
        return self.data

    def allele_mapping(self, allele):
        if allele.startswith('HLA-DQA'):
            return allele.replace('*','').replace(':','')
        else:
            return allele.replace('*','_').replace(':','').replace('HLA-DRB','DRB')

    def get_alleles(self):
        """Get available alleles"""
        cmd = 'netMHCIIpan -list'
        try:
            temp = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
        except:
            print('netMHCIIpan not installed?')
            return []
        alleles = temp.decode().split('\n')
        return alleles

    def convert_allele_name(self, a):
        """Convert allele names to internally used form"""
        if not a.startswith('HLA'):
            return 'HLA-'+a.replace(':','')
        else:
            return a.replace(':','')

    def check_install(self):
        cmd = 'netMHCIIpan'
        try:
            temp = subprocess.check_output(cmd, shell=True)
            print('netMHCIIpan appears to be installed')
            return 1
        except:
            print('netMHCIIpan not found')
            return 0

    @classmethod
    def _get_name(self):
        return 'netmhciipan'
class IEDBMHCIPredictor(Predictor):
    """Uses the IEDB tools methods, requires the iedb-mhc1 tools.
       Tested with version 2.17"""

    def __init__(self, data=None, method='IEDB_recommended'):
        Predictor.__init__(self, data=data)
        self.name = 'iedbmhc1'
        self.methods = ['ann', 'IEDB_recommended', 'comblib_sidney2008',
                        'consensus', 'smm', 'netmhcpan', 'smmpmbec']
        self.scorekey = 'ic50'
        self.cutoff = 500
        self.operator = '<'
        self.rankascending = 1
        self.method = method
        self.iedbmhc1_path = defaults['iedbmhc1_path']
        self.qf = self.get_quantile_data()
        return

    def predict(self, sequence=None, peptides=None, length=11, overlap=1,
                allele='HLA-A*01:01', name='', method=None, show_cmd=False,
                **kwargs):
        """
        Use the IEDB MHC-I python module to get predictions. Requires that
        the IEDB MHC-I tools are installed locally.

        Args:
            sequence: a sequence to be predicted
            peptides: a list of arbitrary peptides instead of a single sequence
        Returns:
            pandas dataframe
        """
        if sequence is not None:
            tempf = os.path.join(self.temppath, name+'.fa')
            seqfile = write_fasta(sequence, id=name, filename=tempf)
        elif peptides is not None:
            if type(peptides) is not pd.DataFrame:
                peptides = self.seqs_to_dataframe(peptides)
            tempf = tempfile.mktemp()+'.txt'
            seqfile = sequtils.dataframe_to_fasta(peptides, outfile=tempf,
                                                  seqkey='peptide', idkey='name')
            length = peptides.peptide.str.len().max()
        else:
            return
        if not os.path.exists(self.iedbmhc1_path):
            print('IEDB MHC-I tools not found, set the iedbmhc1_path option')
            return
        if method == None:
            method = self.method
        if method not in self.methods:
            print('available methods are %s' %self.methods)
            return
        self.method = method
        cmd = os.path.join(self.iedbmhc1_path, 'src/predict_binding.py')
        cmd = cmd+' %s %s %s %s' %(method, allele, length, seqfile)
        if show_cmd == True:
            print(cmd)
        from subprocess import Popen, PIPE
        try:
            p = Popen(cmd, stdout=PIPE, shell=True)
            temp,error = p.communicate()
        except OSError as e:
            print(e)
            return
        df = self.prepare_data(temp, name)
        return df

    def prepare_data(self, rows, name):
        """Prepare data from results"""
        try:
            df = pd.read_csv(io.BytesIO(rows), sep="\t")
        except:
            #no output to read
            return
        if len(df) == 0:
            #print the error string from the output
            print(rows)
            return df
        df = df.replace('-', np.nan)
        df = df.dropna(axis=1, how='all')
        df.rename(columns={'start':'pos'}, inplace=True)
        df = df.drop(columns=['seq_num','end'])
        df['core'] = df.peptide
        df['name'] = name
        if 'method' not in df.columns:
            df['method'] = self.method
        if self.method in ['IEDB_recommended','consensus']:
            df['ic50'] = df.filter(regex="ic50").mean(1)
        if not 'score' in df.columns:
            df['score'] = df.ic50
        self.get_ranking(df)
        self.data = df
        return df

    def get_alleles(self):
        """Get available alleles from the model_list file and convert to
           standard names"""
        alleles = list(pd.read_csv(os.path.join(datadir,
                       'iedb_mhc1_alleles.csv')).allele.values)
        return alleles

    def get_allele_data(self):
        path = self.iedbmhc1_path
        if not os.path.exists(path):
            print('iedb tools not found')
            return
        c = os.path.join(path, 'src/predict_binding.py')
        for m in self.methods:
            cmd = c + ' %s mhc' %m
            temp = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
            print(temp)
        return

    def check_install(self):
        path = self.iedbmhc1_path
        if not os.path.exists(path):
            print('iedb MHC-I tools not found')
            return
        cmd = os.path.join(path, 'src/predict_binding.py')
        try:
            temp = subprocess.check_output(cmd, shell=True)
            print('IEDB MHC-I tools appear to be installed')
            return 1
        except:
            return 0

    @classmethod
    def _get_name(self):
        return 'iedbmhc1'
class IEDBMHCIIPredictor(Predictor):
    """Uses the IEDB MHC-II method, requires tools to be installed locally"""

    def __init__(self, data=None):
        Predictor.__init__(self, data=data)
        self.name = 'iedbmhc2'
        #score will store the ic50 value for most methods except sturniolo
        self.scorekey = 'score'
        self.cutoff = 500
        self.operator = '<'
        self.rankascending = 1
        self.methods = ['comblib','consensus3','IEDB_recommended',
                        'NetMHCIIpan','nn_align','smm_align','sturniolo']
        self.method = 'IEDB_recommended'
        self.iedbmhc2_path = defaults['iedbmhc2_path']
        self.qf = self.get_quantile_data()
        return

    def prepare_data(self, rows, name):
        """Read data from raw output"""
        if len(rows) == 0:
            return
        df = pd.read_csv(io.BytesIO(rows), sep=r'\t', engine='python',
                         index_col=False)
        extracols = ['Start','End','comblib_percentile','smm_percentile',
                     'nn_percentile','Sturniolo core',' Sturniolo score',
                     ' Sturniolo percentile']
        df.reset_index(inplace=True)
        df.rename(columns={'index':'pos','Sequence':'peptide','Allele':'allele'},
                  inplace=True)
        df['core'] = df[df.filter(regex="core").columns[0]]
        df['name'] = name
        if 'method' not in df.columns:
            df['method'] = self.method
        method = df.loc[0].method
        if method == 'Sturniolo':
            df['score'] = df.sturniolo_score
        else:
            df['ic50'] = df.filter(regex="ic50").mean(1)
        if not 'score' in df.columns:
            df['score'] = df.ic50
        self.get_ranking(df)
        self.data = df
        return df

    def predict(self, sequence=None, peptides=None, length=15, overlap=None,
                show_cmd=False, allele='HLA-DRB1*01:01',
                method='IEDB_recommended', name='', **kwargs):
        """
        Use the IEDB MHC-II python module to get predictions. Requires that
        the IEDB MHC-II tools are installed locally. A sequence argument is
        provided since the cmd line only accepts a whole sequence to be
        fragmented.
        """
        self.method = method
        if sequence is not None:
            seqfile = tempfile.mktemp()+'.fa'
            write_fasta(sequence, id=name, filename=seqfile)
        elif peptides is not None:
            if type(peptides) is not pd.DataFrame:
                peptides = self.seqs_to_dataframe(peptides)
            tempf = tempfile.mktemp()+'.txt'
            seqfile = sequtils.dataframe_to_fasta(peptides, outfile=tempf,
                                                  seqkey='peptide', idkey='name')
            length = peptides.peptide.str.len().max()
        else:
            return
        path = self.iedbmhc2_path
        if method == None:
            method = 'IEDB_recommended'
        if not os.path.exists(path):
            print('iedb MHC-II tools not found')
            return
        cmd = os.path.join(path, 'mhc_II_binding.py')
        cmd = cmd+' %s %s %s' %(method, allele, seqfile)
        if show_cmd is True:
            print(cmd)
        try:
            from subprocess import Popen, PIPE, STDOUT
            p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT,
                      close_fds=True)
            temp = p.stdout.read()
        except Exception as e:
            print(e)
            print('check allele %s is correct.' %allele)
            return
        data = self.prepare_data(temp, name)
        return data

    def get_alleles(self):
        alleles = list(pd.read_csv(os.path.join(datadir,
                       'iedb_mhc2_alleles.csv')).allele.values)
        return alleles

    def check_install(self):
        path = self.iedbmhc2_path
        if not os.path.exists(path):
            print('iedb MHC-II tools not found')
            return
        cmd = os.path.join(path, 'mhc_II_binding.py')
        try:
            temp = subprocess.check_output(cmd, shell=True)
            print('IEDB MHC-II tools appear to be installed')
            return 1
        except:
            return 0

    @classmethod
    def _get_name(self):
        return 'iedbmhc2'
class TEpitopePredictor(Predictor):
    """Predictor using the TEPITOPEpan QM method"""

    def __init__(self, data=None, **kwargs):
        Predictor.__init__(self, data=data)
        self.name = 'tepitope'
        self.pssms = tepitope.get_pssms()
        self.cutoff = 2
        self.operator = '>'
        self.rankascending = 0
        #load precalculated quantiles for sample peptides
        self.qf = self.get_quantile_data()

    def predict(self, peptides=None, length=9, overlap=1,
                allele='HLA-DRB1*0101', name='', pseudosequence=None, **kwargs):
        if length not in [9,11,13,15]:
            print('you should use lengths of 9, 11, 13 or 15.')
        allele = allele.replace(':','')
        if not allele in self.pssms:
            #compute a virtual matrix for alleles without a pssm
            m = tepitope.create_virtual_pssm(allele)
            if m is None:
                print('no such allele', allele)
                return pd.DataFrame()
        else:
            m = self.pssms[allele]
        m = m.transpose().to_dict()
        result = tepitope.get_scores(m, peptides=peptides)
        df = self.prepare_data(result, name, allele)
        self.data = df
        return df

    def supported_lengths(self):
        """Return supported peptide lengths"""
        return [9,11,13,15]

    def get_alleles(self):
        return tepitope.get_alleles()

    def check_alleles(self, alleles):
        alleles = [i.replace(':','') for i in alleles]
        a = self.get_alleles()
        found = list((set(a) & set(alleles)))
        return found

    @classmethod
    def _get_name(self):
        return 'tepitope'
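#Example: tepitope is the built-in MHC-II method and needs no external tools.
#A minimal sketch; the peptide and allele are illustrative.
#
#   P = TEpitopePredictor()
#   df = P.predict_peptides(['FVNQHLCGSHLVEAL'], alleles=['HLA-DRB1*0101'])
#   print(df[:5])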
'''class IEDBBCellPredictor(Predictor):
    """Uses the IEDB tools methods, requires the iedb bcell tools.
       see http://tools.immuneepitope.org/bcell """

    def __init__(self, data=None):
        Predictor.__init__(self, data=data)
        self.name = 'iedbbcell'
        self.scorekey = 'Score'
        self.methods = ['Chou-Fasman', 'Emini', 'Karplus-Schulz',
                        'Kolaskar-Tongaonkar', 'Parker', 'Bepipred']
        self.cutoff = 0.9
        self.operator = '>'
        self.rankascending = 0
        self.method = 'Bepipred'
        self.path = iedbbcellpath

    def predict(self, sequence=None, peptides=None, window=None, name=''):
        """Uses code from iedb predict_binding.py"""
        value = self.method
        currpath = os.getcwd()
        os.chdir(self.path)
        sys.path.append(self.path)
        from src.BCell import BCell
        bcell = BCell()
        filepath = os.path.join(self.path, 'bcell_scales.pickle')
        picklefile = open(filepath, 'rb')
        scale_dict = pickle.load(picklefile)
        bcell.scale_dict = scale_dict[value]
        if window == None:
            window = bcell.window
        center = "%d" %round(int(window)/2.0)
        if value == 'Emini':
            results = bcell.emini_method(value, sequence, window, center)
        elif value == 'Karplus-Schulz':
            results = bcell.karplusshulz_method(value, sequence, window, center)
        elif value == 'Kolaskar-Tongaonkar':
            results = bcell.kolaskartongaonkar_method(value, sequence, window, center)
        elif value == 'Bepipred':
            results = bcell.bepipred_method(value, sequence, window, center)
        else:
            results = bcell.classical_method(value, sequence, window, center)
        threshold = round(results[1][0], 3)
        temp = results[0]
        self.prepare_data(temp, name)
        os.chdir(currpath)
        return self.data

    def prepare_data(self, temp, name):
        df = pd.read_csv(temp, sep=",")
        if len(df) == 0:
            return
        df = df.dropna(axis=1, how='all')
        df['name'] = name
        self.data = df
        return

    def predict_sequences(self, recs, names=None, save=False,
                          label='', path='', **kwargs):
        """Get predictions for a set of proteins - no alleles so we override
           the base method for this too."""
        recs = sequtils.get_cds(recs)
        if names != None:
            recs = recs[recs.locus_tag.isin(names)]
        proteins = list(recs.iterrows())
        res = []
        for i,row in proteins:
            seq = row['translation']
            name = row['locus_tag']
            df = self.predict(sequence=seq, name=name)
            res.append(df)
            if save == True:
                fname = os.path.join(path, name+'.csv')
                df.to_csv(fname)
        self.data = res = pd.concat(res)
        return'''
class MHCFlurryPredictor(Predictor):
    """
    Predictor using MHCFlurry for MHC-I predictions. Requires you to install
    the python package mhcflurry with its dependencies.
    see https://github.com/hammerlab/mhcflurry
    """

    def __init__(self, data=None, **kwargs):
        Predictor.__init__(self, data=data)
        self.name = 'mhcflurry'
        self.cutoff = 500
        self.operator = '<'
        self.scorekey = 'score'
        self.rankascending = 1
        if self.check_install() == False:
            return
        self._check_models()
        #load precalculated quantiles for sample peptides
        self.qf = self.get_quantile_data()
        return

    def predict(self, peptides=None, overlap=1, show_cmd=False,
                allele='HLA-A0101', name='', **kwargs):
        """Uses the mhcflurry-predict command line tool for prediction"""
        p = ' '.join(peptides)
        tmp = tempfile.mkstemp()[1]
        try:
            cmd = 'mhcflurry-predict --alleles {a} --peptides {p} --out {t}'\
                   .format(a=allele, p=p, t=tmp)
            if show_cmd == True:
                print(cmd)
            subprocess.check_output(cmd, shell=True)
        except Exception as e:
            print('failed to predict for allele %s' %allele)
            print(e)
            return
        df = pd.read_csv(tmp)
        df['score'] = df.mhcflurry_affinity
        df['pos'] = df.index+1
        df['name'] = name
        self.get_ranking(df)
        self.data = df
        os.remove(tmp)
        return df

    def convert_allele_name(self, r):
        if ':' not in r:
            return r[:5]+'*'+r[5:7]+':'+r[7:]
        else:
            return r[:5]+'*'+r[5:7]+r[7:]

    def get_alleles(self):
        with open(os.path.join(datadir, 'mhcflurry_alleles.txt')) as f:
            p = f.readlines()
        p = [x.strip() for x in p]
        p = list(filter(None, p))
        return p

    def check_install(self):
        try:
            from mhcflurry import Class1AffinityPredictor
            return True
        except:
            #use pip install mhcflurry to install
            return False

    def _check_models(self):
        try:
            import mhcflurry
            mhcflurry.class1_affinity_predictor.get_default_class1_models_dir()
        except:
            print('first use. downloading class-I models')
            subprocess.check_output('mhcflurry-downloads fetch models_class1',
                                    shell=True)
            return True
        return False

    def predict_sequences(self, recs, **kwargs):
        """Override so we can switch off multi threading."""
        kwargs['threads'] = 1
        results = Predictor.predict_sequences(self, recs, **kwargs)
        return results

    def predict_peptides(self, peptides, **kwargs):
        """Override so we can switch off multi threading."""
        kwargs['threads'] = 1
        data = Predictor.predict_peptides(self, peptides, **kwargs)
        return data

    @classmethod
    def _get_name(self):
        return 'mhcflurry'
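#Example: a sketch of MHC-I prediction via mhcflurry, assuming the package
#and its models are installed (pip install mhcflurry, then
#mhcflurry-downloads fetch models_class1). The peptide is illustrative.
#
#   P = MHCFlurryPredictor()
#   df = P.predict_peptides(['SLYNTVATL'], alleles=['HLA-A0101'])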
'''class MHCNuggetsPredictor(Predictor):
    """
    Predictor using MHCNuggets for MHC-I predictions. Requires you to install
    the package locally from https://github.com/KarchinLab/mhcnuggets
    see https://www.biorxiv.org/content/early/2017/07/27/154757
    """

    def __init__(self, data=None):
        Predictor.__init__(self, data=data)
        self.name = 'mhcnuggets'
        self.cutoff = 500
        self.operator = '<'
        self.scorekey = 'ic50'
        self.rankascending = 1
        self.qf = self.get_quantile_data()
        return

    def predict(self, peptides=None, overlap=1, allele='HLA-A01:01',
                name='temp', **kwargs):
        """Uses cmd line call to mhcnuggets."""
        allele = allele.replace('*','')
        from mhcnuggets.src.predict import predict
        pepfile = self.write_seqs(peptides)
        outfile = tempfile.mktemp()+'.txt'
        predict(class_='I', peptides_path=pepfile, mhc=allele, output=outfile)
        res = pd.read_csv(outfile)
        df = self.prepare_data(res, name, allele)
        return df

    def prepare_data(self, df, name, allele):
        """Get result into dataframe"""
        df['name'] = name
        df['allele'] = allele
        df['pos'] = df.index.copy()
        self.get_ranking(df)
        self.data = df
        return df

    def write_seqs(self, peptides):
        tempf = tempfile.mktemp()+'.pep'
        f = open(tempf, 'w')
        for p in peptides:
            f.write("%s\n" % p)
        f.close()
        return tempf

    def get_alleles(self):
        with open(os.path.join(datadir, 'mhcnuggets_alleles.txt')) as f:
            p = f.readlines()
        p = [x.strip() for x in p]
        p = list(filter(None, p))
        return p

    @classmethod
    def _get_name(self):
        return 'mhcnuggets' '''
class DummyPredictor(Predictor):
    """Returns random scores. Used for testing"""

    def __init__(self, data=None, scoring=None):
        Predictor.__init__(self, data=data)
        self.name = 'null'
        self.scorekey = 'score'
        self.cutoff = 500
        self.operator = '<'
        self.rankascending = 1

    def predict(self, peptides, allele='HLA-A*01:01', name='temp', **kwargs):
        #random log50k values; randint(0,1) would always return 0
        sc = np.random.uniform(0, 1, len(peptides))
        res = pd.DataFrame(np.column_stack([peptides,sc]),
                           columns=['peptide','log50k'])
        res['score'] = res.log50k.astype('float').apply(lambda x: mhclearn.log50k2aff(x))
        df = self.prepare_data(res, name, allele)
        return df

    @classmethod
    def _get_name(self):
        return 'dummy'
class BasicMHCIPredictor(Predictor):
    """Built-in basic MHC-I predictor. Should be used as a fallback if no
       other predictors are available."""

    def __init__(self, data=None, scoring=None):
        Predictor.__init__(self, data=data)
        self.name = 'basicmhc1'
        self.scorekey = 'score'
        self.cutoff = 500
        self.operator = '<'
        self.rankascending = 1
        self.qf = self.get_quantile_data()
        self.encoder = mhclearn.one_hot_encode
        return

    def predict(self, peptides, allele='HLA-A*01:01', name='temp', **kwargs):
        """Encode and predict peptides with the saved regressor"""
        length = len(peptides[0])
        #only lengths 9-11 are supported
        if length < 9 or length > 11:
            return
        encoder = self.encoder
        if type(peptides) is list:
            s = pd.Series(peptides)
        else:
            s = peptides
        #encode
        X = s.apply(lambda x: pd.Series(encoder(x)), 1)
        reg = mhclearn.get_model(allele, length)
        if reg is None:
            print('no model for this allele')
            return
        sc = reg.predict(X).round(4)
        res = pd.DataFrame(np.column_stack([peptides,sc]),
                           columns=['peptide','log50k'])
        res['log50k'] = res.log50k.astype('float')
        res['score'] = res.log50k.apply(lambda x: mhclearn.log50k2aff(x)).round(2)
        df = self.prepare_data(res, name, allele)
        return df

    def prepare_data(self, df, name, allele):
        """Put raw prediction data into a DataFrame and rank. Can be
           overriden for custom processing."""
        df['name'] = name
        df['pos'] = df.index+1
        df['allele'] = allele
        self.get_ranking(df)
        return df

    def supported_lengths(self):
        """Return supported peptide lengths"""
        return [9,10,11]

    def get_alleles(self):
        p = mhclearn.get_allele_names()
        return p

    def check_install(self):
        reg = mhclearn.build_predictor('HLA-A*01:01')
        if reg is None:
            return False
        return True

    def predict_peptides(self, peptides, **kwargs):
        """Override so we can train models before predictions."""
        mhclearn.train_models(alleles=kwargs['alleles'], encoder=self.encoder)
        data = Predictor.predict_peptides(self, peptides, **kwargs)
        return data

    def predict_sequences(self, recs, **kwargs):
        """Override so we can train models before predictions."""
        mhclearn.train_models(alleles=kwargs['alleles'], encoder=self.encoder)
        results = Predictor.predict_sequences(self, recs, **kwargs)
        return results

    @classmethod
    def _get_name(self):
        return 'basicmhc1'
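#Example: the built-in fallback predictor, a sketch needing no external tools.
#Models are trained on first use, so the initial call may be slow. The
#peptides and allele are illustrative.
#
#   P = BasicMHCIPredictor()
#   df = P.predict_peptides(['SLYNTVATL','GILGFVFTL'], alleles=['HLA-A*01:01'])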