Source code for epitopepredict.base

#!/usr/bin/env python

"""
    MHC prediction base module for core classes
    Created November 2013
    Copyright (C) Damien Farrell
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License
    as published by the Free Software Foundation; either version 3
    of the License, or (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""

from __future__ import absolute_import, print_function
import sys, os, shutil, string
import csv, glob, pickle, tempfile
import time, io
import operator as op
import re, types
try:
    unicode = unicode
except NameError:
    #unicode is undefined, must be Python 3
    str = str
    unicode = str
    basestring = (str,bytes)
import math
import subprocess
from subprocess import CalledProcessError
import numpy as np
import pandas as pd
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from . import utilities, peptutils, sequtils, tepitope, config, mhclearn

#write a default config if not present
config_file = config.write_default_config()
home = os.path.expanduser("~")
config_path = os.path.join(home, '.config/epitopepredict')
module_path = os.path.dirname(os.path.abspath(__file__)) #path to module
datadir = os.path.join(module_path, 'mhcdata')
predictors = ['basicmhc1','tepitope','netmhciipan','netmhcpan','mhcflurry','iedbmhc1','iedbmhc2']
iedbmhc1_methods = ['ann', 'IEDB_recommended', 'comblib_sidney2008', 'consensus', 'smm', 'smmpmbec']
mhc1_predictors = ['basicmhc1','netmhcpan','iedbmhc1','mhcflurry'] + iedbmhc1_methods
iedbsettings = {'cutoff_type': 'none', 'pred_method': 'IEDB_recommended',
            'output_format': 'ascii', 'sort_output': 'position_in_sequence',
            'sequence_format': 'auto', 'allele': 'HLA-DRB1*01:01', 'length':'11',
            'sequence_file': None}

mhc1_presets = ['mhc1_supertypes','us_caucasion_mhc1','us_african_mhc1','broad_coverage_mhc1']
mhc2_presets = ['mhc2_supertypes','human_common_mhc2','bovine_like_mhc2']

#sequence for testing
testsequence = ('MRRVILPTAPPEYMEAIYPVRSNSTIARGGNSNTGFLTPESVNGDTPSNPLRPIADDTIDHASHTPGSVS'
               'SAFILEAMVNVISGPKVLMKQIPIWLPLGVADQKTYSFDSTTAAIMLASYTITHFGKATNPLVRVNRLGP'
               'GIPDHPLRLLRIGNQAFLQEFVLPPVQLPQYFTFDLTALKLITQPLPAATWTDDTPTGSNGALRPGISFH'
               'PKLRPILLPNKSGKKGNSADLTSPEKIQAIMTSLQDFKIVPIDPTKNIMGIEVPETLVHKLTGKKVTSKN'
               'GQPIIPVLLPKYIGLDPVAPGDLTMVITQDCDTCHSPASLPAVIEK')
presets_dir = os.path.join(module_path, 'presets')

#genome for testing
mtb_genome = os.path.join(datadir, 'MTB-H37Rv.gb')

def read_defaults():
    """Get some global settings such as program paths from config file"""
    cp = config.parse_config(config_file)
    options = config.get_options(cp)
    return options

#get defaults
defaults = read_defaults()

def predict_proteins_worker(P, recs, kwargs):
    try:
        df = P._predict_sequences(recs, **kwargs)
    except KeyboardInterrupt:
        return
    return df

def predict_peptides_worker(P, recs, kwargs):
    try:
        df = P._predict_peptides(recs, **kwargs)
    except KeyboardInterrupt:
        return
    return df

def get_preset_alleles(name):
    """A list of the possible preset alleles"""
    df = pd.read_csv(os.path.join(presets_dir, name+'.csv'), comment='#')
    return list(df.allele)

def first(x):
    return x.iloc[0]

def get_pos(x):
    x['pos'] = np.arange(len(x))
    return x
def get_iedb_request(seq, alleles='HLA-DRB1*01:01', method='consensus3'):
    """Make a web request to the IEDB tools API for MHC-II predictions"""
    import requests
    url = 'http://tools.iedb.org/tools_api/mhcii/'
    values = {'method': method,
              'sequence_text': seq,
              'allele': alleles}
    r = requests.post(url, data=values)
    #use the decoded response body rather than raw bytes
    df = pd.read_csv(io.StringIO(r.text), sep='\t')
    return df

def get_overlapping(index, s, length=9, cutoff=25):
    """Get all mutually overlapping kmers within a cutoff area"""
    g = [s]
    vals = [i for i in range(s, s+cutoff) if i in index]
    for i in range(len(vals)-1):
        if vals[i+1] <= vals[i]+length:
            g.append(vals[i+1])
        else:
            break
    return g

def get_predictor_classes():
    """Get predictor classes in this module."""
    cl = {}
    for o in Predictor.__subclasses__():
        cl[o._get_name()] = o
    return cl

def get_predictor(name='tepitope', **kwargs):
    """Get a predictor object using its name. Valid predictor names
       are held in the predictors attribute."""
    cl = get_predictor_classes()
    if name in iedbmhc1_methods:
        name = 'iedbmhc1'
    if name in cl:
        return cl[name](**kwargs)
    else:
        print('no such predictor %s' %name)
        print('valid names are %s' %', '.join(predictors))
        return
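#Example: a minimal sketch of typical get_predictor use. It assumes the
#package data files are installed; the allele name is illustrative.
#
#   from epitopepredict import base
#   P = base.get_predictor('tepitope')
#   df = P.predict_sequences(base.testsequence, alleles=['HLA-DRB1*0101'])
#   print(P)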
def get_length(data):
    """Get peptide length of a dataframe of predictions"""
    if len(data) > 0:
        return len(data.head(1).peptide.max())
    return

def get_coords(df):
    """Get start/end coords from position and length of peptides"""
    if 'start' in df.columns:
        return df
    df['start'] = df.pos.astype(int)
    df['end'] = (df.pos + df.peptide.str.len()).astype(int)
    return df

def sequence_from_peptides(df):
    """Derive the source sequence from a set of overlapping peptides"""
    l = get_length(df)
    df = df.drop_duplicates('pos').sort_values('pos')
    x = df.peptide.str[0]
    last = df.iloc[-1].peptide[1:]
    x = ''.join(x)
    x = x + last
    return x

def seq_from_binders(df):
    x = pd.Series(df.sort_values('pos').peptide.unique())
    return ''.join(x.str[0])

def write_fasta(sequences, id=None, filename='tempseq.fa'):
    """Write a fasta file of sequences"""
    if isinstance(sequences, basestring):
        sequences = [sequences]
    out = open(filename, 'w')
    i = 1
    for seq in sequences:
        if id == None:
            id = 'temp%s' %i
        SeqIO.write(SeqRecord(Seq(seq), id, description='temp'), out, 'fasta')
        i += 1
    out.close()
    return filename

def get_sequence(seqfile):
    """Get sequence from fasta file"""
    recs = list(SeqIO.parse(seqfile, 'fasta'))[0]
    #Seq.tostring() was removed from Biopython, use str() instead
    sequence = str(recs.seq)
    return sequence

def clean_sequence(seq):
    """Clean a sequence of invalid characters before prediction"""
    new = re.sub('[-*_#X]', '', seq)
    return new
def get_nearest(df):
    """Get distance to the nearest other binder for each binder"""
    grps = df.groupby('name')
    new = []
    def closest(x, g):
        if len(g.pos) == 1:
            return 1
        return min([abs(x.pos-i) for i in g.pos if i != x.pos])
    for i,g in grps:
        g['nearest'] = g.apply(lambda x: closest(x,g), axis=1)
        new.append(g)
    df = pd.concat(new)
    return df

def summarize(data):
    """Summarise prediction data"""
    allelegrps = data.groupby('allele')
    proteins = data.groupby('name')
    print('summary: %s peptides in %s proteins and %s alleles' %(len(data),
           len(proteins), len(allelegrps)))
    return

def get_filenames(path, names=None, file_limit=None):
    """Get csv filenames in a results path, optionally filtered by name"""
    if not os.path.exists(path):
        print('no such path %s' %path)
        return
    files = glob.glob(os.path.join(path, '*.csv'))
    if names is not None:
        names = [n+'.csv' for n in names]
        files = [n for n in files if os.path.basename(n) in names]
    if len(files) == 0:
        return
    if file_limit != None:
        files = files[:file_limit]
    return files

def results_from_csv(path=None, names=None, compression='infer', file_limit=None):
    """
    Load results from a single csv file or multiple csv files in a folder.

    Args:
        path: name of a csv file or directory with one or more csv files
        names: names of proteins to load
        file_limit: load only this number of files
    """
    if os.path.isfile(path):
        data = pd.read_csv(path, index_col=0)
    elif os.path.isdir(path):
        files = get_filenames(path, names, file_limit)
        if files == None:
            return
        res = []
        for f in files:
            df = pd.read_csv(f, index_col=0, compression=compression)
            if len(df) == 0:
                continue
            res.append(df)
        data = pd.concat(res)
    else:
        return
    return data
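#Example: a minimal sketch of saving and re-loading results, assuming a
#predictor P with prediction data already present. Paths are illustrative.
#
#   P.save(prefix='results')                  #one csv per protein under results/<P.name>/
#   df = results_from_csv('results/tepitope')
#   P.load('results/tepitope')                #equivalent, stores into P.data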
def get_quantiles(predictor):
    """
    Get quantile score values per allele in a set of predictions.
    Used for making pre-defined cutoffs.

    Args:
        predictor: predictor with a set of predictions
    """
    P = predictor
    res = []
    value = P.scorekey
    data = P.data
    for a,g in data.groupby('allele'):
        q = g[value].quantile(np.arange(0,1,.01))
        q.name = a
        res.append(q)
    res = pd.DataFrame(res)
    if P.rankascending == 1:
        res.columns = res.columns.sort_values(ascending=False)
    return res

def get_standard_mhc1(name):
    """Taken from iedb mhc1 utils.py"""
    temp = name.strip().split('-')
    length = temp[-1]
    mhc = '-'.join(temp[0:-1])
    return mhc

def get_drb_list(a):
    """Get DRB list in standard format"""
    s = pd.Series(a)
    s = s[s.str.contains('DRB')]
    s = s.apply(lambda x: 'HLA-'+x.replace('_','*'))
    return list(s)

def get_dqp_list(a):
    """Get DQ list in standard format"""
    s = pd.Series(a)
    s = s[s.str.contains('DQ')]
    s = s.apply(lambda x: x.replace('_','*'))
    return list(s)

def get_standard_mhc2(x):
    return 'HLA'+x.replace('_','*')

def compare_predictors(p1, p2, by='allele', cutoff=5, n=2):
    """
    Compare predictions from 2 different predictors.

    Args:
        p1, p2: predictors with prediction results for the same
        set of sequences and alleles
        by: how to group the correlation plots
    """
    import pylab as plt
    import seaborn as sns

    a = p1.promiscuous_binders(n=n, cutoff=cutoff)
    b = p2.promiscuous_binders(n=n, cutoff=cutoff)
    f = utilities.venndiagram([a.peptide, b.peptide], [p1.name, p2.name], colors=('y','b'))
    f.suptitle('common\npromiscuous binders n=%s' %n)
    plt.tight_layout()
    for p in [p1,p2]:
        if not 'score' in p.data.columns:
            p.data['score'] = p.data[p.scorekey]
    #merge data for plotting the score correlation
    df = pd.merge(p1.data, p2.data, on=['peptide','allele','name','pos'])
    g = sns.lmplot(x='score_x', y='score_y', data=df, col=by, ci=None,
                   hue=by, col_wrap=3, markers='o',
                   scatter_kws={'alpha':0.5,'s':30})
    g.set_axis_labels(p1.name, p2.name)
    return
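#Example: comparing two predictors on the same inputs, a sketch assuming both
#tools are installed and the same sequences/alleles were predicted.
#
#   p1 = get_predictor('tepitope')
#   p2 = get_predictor('netmhciipan')
#   for p in (p1, p2):
#       p.predict_sequences(recs, alleles=alleles)   #recs/alleles defined elsewhere
#   compare_predictors(p1, p2, n=2)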
def reshape_data(pred, peptides=None, name=None, values='score'):
    """
    Create a summary table of scores per peptide/allele.

    Args:
        pred: predictor with data
        peptides: optional list of peptides to filter on
        name: optional protein name to filter on
        values: column to use for the table values
    """
    df = pred.data
    idx = ['name','pos','peptide']
    if name != None:
        df = df[df.name==name]
        idx = ['pos','peptide']
    if peptides is not None:
        df = df[df.peptide.isin(peptides)]
    p = df.pivot_table(index=idx, columns='allele', values=values)
    p = p.round(3)
    order = p.mean(1).sort_values()
    #reindex_axis is deprecated, use reindex
    p = p.reindex(order.index, axis=0)
    return p

def plot_summary_heatmap(p, kind='default', name=None):
    """Plot a heatmap of binders using a summary dataframe."""
    import pylab as plt
    import seaborn as sns
    sns.set_context("notebook", font_scale=1.2)
    plt.rcParams['savefig.dpi'] = 100
    h = int(2 + .2*len(p))
    plt.figure(figsize=(10,h))
    ax = sns.heatmap(p, cbar_kws={"shrink": .5})
    if name != None:
        ax.set_title(name)
    return ax

def protein_summary(pred, peptides, name):
    """Formatted protein summary table"""
    x = reshape_data(pred, peptides, name=name)
    return x.style.background_gradient(cmap='Reds', high=.2).highlight_null('white')

def summarize_by_protein(pred, pb):
    """Heatmaps or tables of binders per protein/allele"""
    for i,g in pb.groupby('name'):
        #reshape_data replaces the undefined binders_allele_summary call
        x = reshape_data(pred, g.peptide, values='score', name=i)
        ax = plot_summary_heatmap(x, name=i)
def split_peptides(df, length=9, seqkey='sequence', newcol='peptide'):
    """Split sequences in a dataframe into peptide fragments"""
    res = []
    for n,r in df.iterrows():
        seq = r[seqkey]
        p = peptutils.get_fragments(seq, 1, length)
        p = p.rename(columns={'peptide': newcol})
        p['name'] = n
        p[seqkey] = seq
        p = p.set_index('name')
        res.append(p)
    res = pd.concat(res)
    res = df.merge(res, on=seqkey)
    return res
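#Example: a sketch of splitting sequences into 9-mers. The input dataframe
#and column names here are illustrative.
#
#   seqs = pd.DataFrame({'sequence': [testsequence]})
#   frags = split_peptides(seqs, length=9, seqkey='sequence')
#   print(frags.peptide.head())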
def check_snap():
    """Check if running inside a snap"""
    if 'SNAP_COMMON' in os.environ:
        return True
    return False

def set_netmhcpan_cmd(path=None):
    """Set up the netMHCpan command to point directly to the binary. This is
       a workaround for running inside snaps and avoids using the tcsh script."""
    toolspath = os.path.join('/home', os.environ['USER'], 'tools')
    netmhcpath = os.path.join(toolspath, 'netMHCpan-4.0/Linux_x86_64')
    if not os.path.exists(netmhcpath):
        print('you need to copy the netMHCpan files to %s' %toolspath)
    os.environ['NETMHCpan'] = netmhcpath
    os.environ['TMPDIR'] = '/tmp'
    cmd = os.path.join(netmhcpath, 'bin/netMHCpan')
    return cmd

class DataFrameIterator:
    """Simple iterator to get dataframes from a path out of memory"""

    def __init__(self, files):
        self.files = files
        self.current = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self.current >= len(self.files):
            raise StopIteration
        else:
            df = pd.read_csv(self.files[self.current], index_col=0)
            self.current += 1
            return df

    def next(self):
        return self.__next__()

    def __repr__(self):
        return 'DataFrameIterator with %s files' %len(self.files)
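#Example: iterating over saved result files without loading them all at once,
#a sketch assuming a folder of per-protein csv files written by Predictor.save.
#
#   files = get_filenames('results/tepitope')
#   for df in DataFrameIterator(files):
#       print(len(df))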
class Predictor(object):
    """Base class to handle generic predictor methods. Usually these will
       wrap methods from other modules and/or call command line predictors.
       Subclass for specific functionality."""

    def __init__(self, data=None):
        self.data = data
        self.name = ''
        self.scorekey = 'score'
        self.operator = '<'
        self.rankascending = 0
        #can specify per allele cutoffs here
        self.allelecutoffs = None
        self.temppath = tempfile.mkdtemp()
        self.path = None
        return

    @classmethod
    def _get_name(self):
        return 'base class'

    def __repr__(self):
        if (self.data is None) or len(self.data) == 0:
            return '%s predictor' %self.name
        else:
            n = len(self.data.name.unique())
            return '%s predictor with results in %s sequences' %(self.name, n)

    def supported_lengths(self):
        """Return supported peptide lengths"""
        return [9,11]

    def predict(self, sequence=None, peptides=None, length=9, overlap=1,
                allele='', name=''):
        """Does the actual scoring of a sequence. Should be overriden.
           Should return a pandas DataFrame."""
        return

    def prepare_data(self, result, name, allele):
        """Put raw prediction data into a DataFrame and rank. Can be
           overriden for custom processing."""
        df = pd.DataFrame(result, columns=['peptide','core','pos','score'])
        df['name'] = name
        df['allele'] = allele
        self.get_ranking(df)
        return df

    def get_ranking(self, df):
        """Add a ranking column according to scorekey"""
        s = self.scorekey
        df['rank'] = df[s].rank(method='min', ascending=self.rankascending)
        df.sort_values(by=['rank','name','allele'], ascending=True, inplace=True)
        return df

    def evaluate(self, df, key, value, operator='<'):
        """
        Evaluate binders less than or greater than a cutoff.
        This method is called by all predictors to get binders.
        """
        if operator == '<':
            return df[df[key] <= value]
        else:
            return df[df[key] >= value]

    def get_quantile_data(self):
        """Get peptide score rank from quantile data."""
        path = os.path.join(datadir, 'quantiles_%s.csv' %self.name)
        qf = pd.read_csv(path, index_col=0)
        qf.columns = qf.columns.astype('float')
        return qf

    def get_global_rank(self, score, allele):
        """Get an allele specific score percentile from precalculated
           quantile data."""
        qf = self.qf
        s = qf.loc[allele]
        cutoff = np.abs(s - score).idxmin()
        return cutoff

    def get_allele_cutoffs(self, cutoff=.95):
        """Get per allele percentile cutoffs using precalculated quantile values."""
        qf = self.qf
        cutoff = round(cutoff, 2)
        if not cutoff in qf.columns:
            print('please use values between 0 and 1 rounded to 2 decimal places e.g. .95 (95% cutoff)')
        return qf[cutoff]

    def _per_allele_binders(self, data, cuts):
        """Return binders using per allele cutoffs"""
        value = self.scorekey
        res = []
        for a,g in data.groupby('allele'):
            if a not in cuts:
                print('%s not in cutoffs' %a)
                continue
            if self.rankascending == 0:
                b = g[g[value] >= cuts[a]]
            else:
                b = g[g[value] < cuts[a]]
            res.append(b)
        return pd.concat(res)

    def _ranked_binders(self, data, cutoff):
        """Return binders by rank, which has been calculated per sequence/allele"""
        return data[data['rank'] <= cutoff]

    def _score_binders(self, data, cutoff):
        """Return binders by a global single score cutoff"""
        if self.rankascending == 0:
            res = data[data[self.scorekey] >= cutoff]
        else:
            res = data[data[self.scorekey] <= cutoff]
        return res

    def get_scores(self, allele):
        """Return peptides and scores only for an allele"""
        if self.data is None or len(self.data) == 0:
            return
        df = self.data[self.data.allele == allele]
        sc = df[['peptide','score']]
        return sc
    def get_binders(self, cutoff=.95, cutoff_method='default', path=None,
                    name=None, drop_columns=False, limit=None, **kwargs):
        """
        Get the top scoring binders. Default cutoffs are percentiles derived
        from pre-defined predictions over some known antigens. A rank within
        each sequence or an absolute score can be used instead; these will
        give slightly different results.

        Args:
            path: use results in a path instead of loading at once, conserves memory
            cutoff: percentile cutoff (default), absolute score or a rank value within each sequence
            cutoff_method: 'default', 'score' or 'rank'
            name: name of a specific protein/sequence
        Returns:
            binders above cutoff in all alleles, pandas dataframe
        """
        if cutoff_method not in ['default', 'global', 'score', 'rank']:
            print('choose a valid cutoff method')
            return
        cutoff = float(cutoff)
        data = self.data
        if cutoff_method in ['default','global','']:
            #per allele percentile cutoffs
            cuts = self.get_allele_cutoffs(cutoff)
        if path is not None:
            #get binders out of memory for large datasets
            files = get_filenames(path)
            if files == None:
                return
            if name is not None:
                files = [f for f in files if f.find(name+'.csv') != -1]
            d = DataFrameIterator(files)
            res = []
            for data in d:
                if cutoff_method in ['default','global','']:
                    b = self._per_allele_binders(data, cuts)
                elif cutoff_method == 'rank':
                    b = self._ranked_binders(data, cutoff)
                elif cutoff_method == 'score':
                    b = self._score_binders(data, cutoff)
                res.append(b)
            res = pd.concat(res)
        else:
            if data is None:
                return
            if cutoff_method in ['default','global','']:
                res = self._per_allele_binders(data, cuts)
            elif cutoff_method == 'rank':
                res = self._ranked_binders(data, cutoff)
            elif cutoff_method == 'score':
                res = self._score_binders(data, cutoff)
        names = list(res['name'].unique())
        if name != None and name in names:
            res = res[res.name == name]
        if limit != None:
            res = res.groupby('name').head(limit)
        return res
    def promiscuous_binders(self, binders=None, name=None, cutoff=.95,
                            cutoff_method='default', n=1, unique_core=True,
                            limit=None, **kwargs):
        """
        Get binders present in at least n alleles. If no precalculated
        binders are provided, get_binders is called with the given cutoff
        parameters.

        Args:
            binders: can provide a precalculated list of binders
            name: specific protein, optional
            cutoff: cutoff for get_binders (rank, score or percentile)
            cutoff_method: 'default', 'rank' or 'score'
            n: min number of alleles
            unique_core: removes peptides with duplicate cores and picks the
            most promiscuous and highest ranked, used for mhc-II predictions
            limit: limit the number of peptides per protein, default None
        Returns:
            a pandas dataframe
        """
        n = int(n)
        if binders is None:
            binders = self.get_binders(name=name, cutoff=cutoff,
                                       cutoff_method=cutoff_method)
        if binders is None:
            print('no binders found, check that prediction data is present')
            return
        if 'core' not in binders.columns:
            binders['core'] = binders.peptide
        if self.operator == '<':
            func = min
            skname = 'min'
        else:
            func = max
            skname = 'max'
        s = binders.groupby(['peptide','pos','name']).agg({'allele': pd.Series.count,
                            'core': first, self.scorekey: [func, np.mean],
                            'rank': np.median})
        s.columns = s.columns.get_level_values(1)
        s.rename(columns={skname: self.scorekey, 'count': 'alleles',
                          'median': 'median_rank', 'first': 'core'}, inplace=True)
        s = s.reset_index()
        s = s.sort_values(['alleles','median_rank',self.scorekey],
                          ascending=[False,True,self.rankascending])
        #if we just want unique cores, drop_duplicates takes the most promiscuous
        #in each group since we have sorted by alleles and median rank
        if unique_core == True:
            s = s.drop_duplicates('core')
        s = s[s.alleles >= n]
        if limit != None:
            s = s.groupby('name').head(limit)
        return s
    def ranked_binders(self, names=None, how='median', cutoff=None):
        """
        Get the median/mean rank of each binder over all alleles.

        Args:
            names: list of protein names, otherwise all current data used
            how: method to use for rank selection, 'median' (default),
            'best' or 'mean'
            cutoff: apply a rank cutoff if we want to filter (optional)
        """
        df = self.data
        if names != None:
            if isinstance(names, str):
                names = [names]
            df = df[df.name.isin(names)]
        funcs = {'median': np.median, 'mean': np.mean, 'best': min}
        func = funcs[how]
        b = df.groupby(['peptide']).agg({'rank': func, 'pos': first,
                                         'name': first, self.scorekey: np.median})
        b = b.reset_index().sort_values('rank')
        if cutoff != None:
            b = b[b['rank'] < cutoff]
        return b

    def get_unique_cores(self, binders=False):
        """Get only unique cores"""
        if binders == True:
            df = self.get_binders()
        else:
            df = self.data
        grouped = df.groupby('core')
        cores = grouped.agg({self.scorekey: max})
        #DataFrame.sort was removed from pandas, use sort_values
        cores = cores.sort_values(self.scorekey, ascending=self.rankascending)
        return cores

    def seqs_to_dataframe(self, seqs):
        df = pd.DataFrame(seqs, columns=['peptide'])
        df['name'] = df.peptide
        df['pos'] = range(len(seqs))
        return df
    def _predict_peptides(self, peptides, alleles=[], verbose=False,
                          drop_columns=False, **kwargs):
        """
        Predict a set of arbitrary peptide sequences in a list or dataframe.
        These are treated as individual peptides and not split into n-mers.
        This is usually called via predict_peptides. If the predict method
        for the class can only accept protein sequences this needs to be
        overriden.

        Args:
            peptides: list of peptides
            alleles: list of alleles to predict
            drop_columns: only keep default columns
        Returns:
            dataframe with results
        """
        res = []
        if len(alleles) == 0:
            print('no alleles specified')
            return
        if type(alleles) is str:
            alleles = [alleles]
        #remove empty values
        peptides = [i for i in peptides if isinstance(i, basestring)]
        for a in alleles:
            df = self.predict(peptides=peptides, allele=a, **kwargs)
            if df is None or len(df) == 0:
                continue
            res.append(df)
            if verbose == True and len(df) > 0:
                x = df.iloc[0]
                r = self.format_row(x)
                print(r)
        if len(res) == 0:
            return
        data = pd.concat(res)
        #drop non-standard columns for cleaner output
        if drop_columns == True:
            defaultcols = ['pos','peptide',self.scorekey,'allele','rank','name']
            data = data[defaultcols]
        self.data = data
        return data

    def predict_peptides(self, peptides, threads=1, path=None, overwrite=True,
                         name=None, **kwargs):
        """
        Predict a set of individual peptides without splitting them.
        This is a wrapper for _predict_peptides to allow multiprocessing.

        Args:
            peptides: list of peptides
            threads: number of threads to use
            path: if given, save the results to this path
        Returns:
            dataframe with results
        """
        if path is not None:
            fname = os.path.join(path, 'results_%s.csv' %self.name)
            if overwrite == False and os.path.exists(fname):
                #load the data if present and not overwriting
                print('results file found, not overwriting')
                self.path = fname
                return
        #store original peptide list as a dataframe to keep order
        s = pd.DataFrame(peptides, columns=['peptide'])
        s['pos'] = s.index.copy()
        if threads == 1:
            data = self._predict_peptides(peptides, **kwargs)
        else:
            data = self._run_multiprocess(peptides, worker=predict_peptides_worker,
                                          threads=threads, **kwargs)
        if data is None or len(data) == 0:
            print('empty result returned')
            return
        #get original peptide positions by re-merging with the input dataframe,
        #this also keeps empty rows in the correct order
        new = []
        for i,g in data.groupby('allele'):
            if 'pos' in g.columns:
                g = g.drop(columns='pos')
            x = s.merge(g, on=['peptide'], how='left').drop_duplicates(['pos','peptide'])
            x['allele'] = x.allele.fillna(i)
            new.append(x)
        data = pd.concat(new).reset_index(drop=True)
        data = data.groupby('allele').apply(self.get_ranking)
        data = data.reset_index(drop=True)
        if name == None:
            name = 'temp'
        if 'name' not in data.columns:
            data['name'] = name
        if path is not None:
            data.to_csv(fname, float_format='%g')
            self.path = fname
        else:
            self.data = data
        return data

    def _convert_to_dataframe(self, recs):
        """Convert sequence list input to a dataframe"""
        if isinstance(recs, str):
            recs = [recs]
        if isinstance(recs, list):
            idx = [str(i) for i in range(len(recs))]
            recs = pd.DataFrame(list(zip(idx, recs)), columns=['locus_tag','translation'])
        return recs
    def predict_proteins(self, args, **kwargs):
        """Alias to predict_sequences"""
        results = self.predict_sequences(args, **kwargs)
        return results

    def predict_sequences(self, recs, alleles=[], path=None, verbose=False,
                          names=None, key='locus_tag', seqkey='translation',
                          threads=1, **kwargs):
        """
        Get predictions for a set of proteins over multiple alleles, with
        optional parallel execution via the threads parameter.
        This is a wrapper for _predict_sequences with the same args.

        Args:
            recs: list or dataframe with sequences
            path: if provided, save results to this path
            threads: number of processors
            key: seq/protein name key
            seqkey: key for sequence column
            length: length of peptides to split sequence into
        Returns:
            a dataframe of predictions over multiple proteins
        """
        recs = self._convert_to_dataframe(recs)
        if names is not None:
            recs = recs[recs[key].isin(names)]
        if len(recs) == 0:
            print('no records to predict')
            return
        if verbose == True:
            self.print_heading()
        if threads == 1:
            results = self._predict_sequences(recs, alleles=alleles, path=path,
                                              verbose=verbose, **kwargs)
        else:
            results = self._run_multiprocess(recs, alleles=alleles, path=path,
                                             verbose=verbose, threads=threads, **kwargs)
        print('predictions done for %s sequences in %s alleles' %(len(recs), len(alleles)))
        if path is None:
            #if no path we assign results to the data attribute
            self.data = results
        else:
            print('results saved to %s' %os.path.abspath(path))
            results = None
        self.cleanup()
        return results

    def _predict_sequences(self, recs, path=None, overwrite=True, alleles=[],
                           length=11, overlap=1, verbose=False, compression=None,
                           **kwargs):
        """
        Get predictions for a set of proteins and/or over multiple alleles.
        Sequences should be put into a dataframe and supplied to this method
        as the first argument.

        Args:
            recs: protein sequences in a pandas DataFrame
            path: if results are to be saved to disk provide a path
            overwrite: overwrite existing protein files in path if present
            alleles: allele list
            length: length of peptides to predict
            overlap: overlap of n-mers
            verbose: provide output per protein/sequence
        Returns:
            a dataframe of the results if no path is given
        """
        if type(alleles) is str:
            alleles = [alleles]
        elif type(alleles) is pd.Series:
            alleles = alleles.tolist()
        if len(alleles) == 0:
            print('no alleles provided')
            return
        if path is not None and path != '':
            if not os.path.exists(path):
                try:
                    os.mkdir(path)
                except (OSError, FileExistsError):
                    pass
        results = []
        self.length = length
        if compression == '':
            compression = None
        for i,row in recs.iterrows():
            seq = row.translation
            #clean the sequence of non-aa characters
            seq = clean_sequence(seq)
            name = row.locus_tag
            if path is not None:
                fname = os.path.join(path, name+'.csv')
                if os.path.exists(fname) and overwrite == False:
                    continue
            res = []
            peptides, s = peptutils.create_fragments(seq=seq, length=length,
                                                     overlap=overlap)
            for a in alleles:
                #we pass sequence, length and overlap for predictors that
                #have to run on the whole sequence
                df = self.predict(peptides=peptides, sequence=seq, allele=a,
                                  name=name, length=length, overlap=overlap, **kwargs)
                if df is not None:
                    res.append(df)
                else:
                    continue
                if verbose == True and len(df) > 0:
                    x = df.iloc[0]
                    s = self.format_row(x)
                    print(s)
            if len(res) == 0:
                continue
            res = pd.concat(res, sort=True)
            if path is not None and len(res) > 0:
                if compression == 'gzip':
                    fname = fname+'.gz'
                res.to_csv(fname, compression=compression)
            else:
                results.append(res)
        if len(results) > 0:
            results = pd.concat(results)
        return results

    def print_heading(self):
        s = ("{:<25} {:<16} {:<18} {:<}"
             .format('name','allele','top peptide','score'))
        print(s)
        return

    def format_row(self, x):
        s = ("{:<25} {:<16} {:<18} {:} "
             .format(x['name'], x.allele, x.peptide, x[self.scorekey]))
        return s

    def _run_multiprocess(self, recs, threads=2, worker=None, **kwargs):
        """
        Call a function with multiprocessing pools. Used for running
        predictions in parallel where the main input is a pandas dataframe.

        Args:
            recs: input dataframe
            threads: number of cores to use
            worker: function to be run in parallel
        Returns:
            concatenated result, a pandas dataframe
        """
        if worker is None:
            worker = predict_proteins_worker
        import multiprocessing as mp
        maxcpu = mp.cpu_count()
        if threads == 0 or threads > maxcpu:
            threads = maxcpu
        if threads >= len(recs):
            threads = len(recs)
        pool = mp.Pool(threads)
        funclist = []
        st = time.time()
        chunks = np.array_split(recs, threads)
        for recs in chunks:
            f = pool.apply_async(worker, [self, recs, kwargs])
            funclist.append(f)
        result = []
        try:
            for f in funclist:
                df = f.get(timeout=None)
                if df is not None and len(df) > 0:
                    result.append(df)
        except KeyboardInterrupt:
            print('process interrupted')
            pool.terminate()
            sys.exit(0)
        pool.close()
        pool.join()
        if len(result) > 0:
            result = pd.concat(result)
        t = time.time()-st
        if t > 10:
            print('took %s seconds' %str(round(t,3)))
        return result
    def load(self, path=None, names=None, compression='infer', file_limit=None):
        """Load results from a path or single file. See results_from_csv for args."""
        if path is None and self.path != None:
            #use the stored path attribute if no path arg is given
            path = self.path
        data = results_from_csv(path, names, compression, file_limit)
        if data is None:
            print('no data found in path %s' %path)
        else:
            self.data = data
            if not self.scorekey in data.columns:
                print('WARNING. this does not look like the correct data for this predictor')
        return

    def save(self, prefix='_', filename=None, compression=None):
        """
        Save all current predictions dataframe with some metadata.

        Args:
            prefix: if writing to a path, the prefix name
            filename: if saving all to a single file
            compression: a string representing the compression to use,
            allowed values are 'gzip', 'bz2', 'xz'.
        """
        exts = {'gzip':'.gz','bz2':'.bz2','xz':'.xz'}
        if filename != None:
            if compression != None:
                cext = exts[compression]
                if not filename.endswith(cext):
                    filename += cext
            self.data.to_csv(filename, compression=compression)
        else:
            #save one file per protein/name
            ext = '.csv'
            path = os.path.join(prefix, self.name)
            print('saving to %s' %path)
            if not os.path.exists(path):
                os.makedirs(path)
            for name,df in self.data.groupby('name'):
                outfile = os.path.join(path, name+ext)
                df.to_csv(outfile)
        return

    def save_msgpack(self, filename=None):
        """Save as msgpack format - experimental"""
        if filename == None:
            filename = 'epit_%s_%s.msg' %(self.name, self.length)
        print('saving as %s' %filename)
        meta = {'method': self.name, 'length': self.length}
        pd.to_msgpack(filename, meta)
        for i,g in self.data.groupby('name'):
            pd.to_msgpack(filename, g, append=True)
        return
    def summarize(self):
        """Summarise currently loaded data"""
        return summarize(self.data)

    def allele_summary(self, cutoff=5):
        """Allele based summary"""
        b = self.get_binders(cutoff=cutoff)
        s = b.groupby('allele').agg({'peptide': np.size,
                                     self.scorekey: [np.median, np.mean]})
        s.columns = s.columns.get_level_values(1)
        return s

    def protein_summary(self):
        print(self.data.groupby('name').agg({'peptide': np.size}))

    def proteins(self):
        return list(self.data.name.unique())

    def reshape(self, name=None):
        """Return pivoted data over alleles for summary use"""
        df = self.data
        if name != None:
            df = df[df.name==name]
        p = df.pivot_table(index='peptide', columns='allele', values=self.scorekey)
        p = p.reset_index()
        x = list(df.groupby('allele'))[0][1]
        p = p.merge(x[['pos','peptide']], on='peptide')
        p['mean'] = p.mean(1)
        #DataFrame.sort was removed from pandas, use sort_values
        p = p.sort_values('mean', ascending=self.rankascending)
        return p

    def get_names(self):
        """Get names of sequences currently stored as predictions"""
        grp = self.data.groupby('name')
        return sorted(dict(list(grp)).keys())
    def plot(self, name, **kwargs):
        """
        Use the module level plotting.plot_tracks method for a predictor plot.

        Args:
            name: name of the sequence to plot
            n: min number of alleles to be visible
            perc: percentile cutoff for score
            cutoff_method: method to use for cutoffs
        """
        from . import plotting
        if name == None:
            return
        plot = plotting.plot_tracks([self], name=name, **kwargs)
        return plot

    def get_alleles(self):
        """Get available alleles - override"""
        return []

    def check_alleles(self, alleles):
        a = self.get_alleles()
        found = list((set(a) & set(alleles)))
        return found

    def cleanup(self):
        """Remove temp files from predictions"""
        if os.path.exists(self.temppath):
            shutil.rmtree(self.temppath)
        return

    def check_install(self):
        return True
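#Example: a typical workflow on the Predictor API, a sketch assuming a working
#predictor subclass (tepitope ships with the package). Preset and allele names
#are illustrative.
#
#   P = get_predictor('tepitope')
#   alleles = get_preset_alleles('mhc2_supertypes')
#   P.predict_sequences([testsequence], alleles=alleles, length=11)
#   b = P.get_binders(cutoff=.95, cutoff_method='default')
#   pb = P.promiscuous_binders(binders=b, n=2)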
class NetMHCPanPredictor(Predictor):
    """netMHCpan 4.1b predictor, see http://www.cbs.dtu.dk/services/NetMHCpan/
       Default scoring is affinity predictions. To get the newer scoring
       behaviour pass scoring='ligand' to the constructor."""

    def __init__(self, data=None, scoring='affinity'):
        Predictor.__init__(self, data=data)
        self.name = 'netmhcpan'
        self.colnames = ['pos','allele','peptide','core','Of','Gp','Gl','Ip','Il',
                         'Icore','name','raw_score','ic50','rank']
        self.scorekey = 'score'
        self.scoring = scoring
        if self.scoring == 'affinity':
            self.cutoff = 500
            self.operator = '<'
            self.rankascending = 1
        elif scoring == 'ligand':
            self.cutoff = 0.5
            self.operator = '>'
            self.rankascending = 0
        #load precalculated quantiles for sample peptides
        self.qf = self.get_quantile_data()
        self.basecmd = 'netMHCpan'
        #base command needs to point to the binary directly if running in a snap
        if check_snap() is True:
            self.basecmd = set_netmhcpan_cmd()

    def read_result(self, temp):
        """Read raw results from netMHCpan 4.1b output"""
        ignore = ['Pos','#','Protein','']
        cols = ['pos','allele','peptide','core','Of','Gp','Gl','Ip','Il','Icore',
                'Identity','Score_EL','%Rank_EL','Score_BA','%Rank_BA','ic50','BindLevel']
        res = pd.read_csv(io.BytesIO(temp), comment='#', names=cols, sep='\s+',
                          error_bad_lines=False, skiprows=46).dropna(subset=['peptide'])
        res = res[~res.pos.isin(ignore)]
        return res

    def read_result_legacy(self, temp):
        """Read raw results from netMHCpan 4.0 output"""
        ignore = ['Pos','#','Protein','']
        if self.scoring == 'affinity':
            cols = ['pos','allele','peptide','core','Of','Gp','Gl','Ip','Il','Icore',
                    'Identity','Score','ic50','%Rank','X','BindLevel']
        else:
            cols = ['pos','allele','peptide','core','Of','Gp','Gl','Ip','Il','Icore',
                    'Identity','Score','%Rank','X','BindLevel']
        res = pd.read_csv(io.BytesIO(temp), comment='#', names=cols, sep='\s+',
                          error_bad_lines=False, skiprows=46).dropna(subset=['peptide'])
        res = res.drop(columns='X')
        res = res[~res.pos.isin(ignore)]
        return res

    def prepare_data(self, df, name):
        """Prepare netMHCpan results"""
        df = df.apply(lambda x: pd.to_numeric(x, errors='ignore').dropna())
        df['name'] = name
        if self.scoring == 'affinity':
            df['score'] = df.ic50
        else:
            df['score'] = df.Score_EL
        df = df.reset_index(drop=True)
        df['pos'] = df.index.copy()+1
        df = df.dropna(how='all')
        self.get_ranking(df)
        self.data = df
        return

    def predict(self, peptides, allele='HLA-A*01:01', name='temp',
                pseudosequence=None, show_cmd=False, **kwargs):
        """Call netMHCpan command line."""
        allele = allele.replace('*','')
        #write peptides to temp file
        pepfile = tempfile.mktemp()+'.pep'
        with open(pepfile, 'w') as f:
            for p in peptides:
                f.write(p+'\n')
        if self.scoring == 'affinity':
            cmd = '%s -BA -f %s -inptype 1 -a %s' %(self.basecmd, pepfile, allele)
        else:
            cmd = '%s -f %s -inptype 1 -a %s' %(self.basecmd, pepfile, allele)
        if show_cmd is True:
            print(cmd)
        try:
            temp = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
        except Exception as e:
            return
        res = self.read_result(temp)
        if len(res) == 0:
            print('empty result, check allele %s is correct' %allele)
            return res
        self.prepare_data(res, name=name)
        return self.data

    def get_alleles(self):
        """Get available alleles"""
        cmd = 'netMHCpan -listMHC'
        try:
            temp = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
        except:
            print('netMHCpan not installed?')
            return []
        s = temp.decode().split('\n')
        alleles = [i for i in s if not i.startswith('#') and len(i) > 0]
        alleles = [self.convert_allele_name(i) for i in alleles]
        return alleles

    def convert_allele_name(self, a):
        """Convert allele names to internally used form"""
        if not ':' in a and a.startswith('HLA'):
            a = a[:-2]+':'+a[-2:]
        return a

    def check_install(self):
        cmd = '%s -h' %self.basecmd
        try:
            temp = subprocess.check_output(cmd, shell=True)
            print('netMHCpan appears to be installed')
            return 1
        except:
            return 0

    @classmethod
    def _get_name(self):
        return 'netmhcpan'
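#Example: using the netMHCpan wrapper directly, a sketch that assumes the
#netMHCpan binary is installed and on the PATH. Peptides/alleles are illustrative.
#
#   P = NetMHCPanPredictor(scoring='ligand')
#   if P.check_install():
#       df = P.predict_peptides(['SLYNTVATL','GILGFVFTL'], alleles=['HLA-A*02:01'])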
class NetMHCIIPanPredictor(Predictor):
    """netMHCIIpan v3.0 predictor"""

    def __init__(self, data=None):
        Predictor.__init__(self, data=data)
        self.name = 'netmhciipan'
        self.colnames = ['pos','HLA','peptide','Identity','Pos','Core',
                         '1-log50k(aff)','Affinity','Rank']
        self.scorekey = 'score'
        self.cutoff = 500
        self.operator = '<'
        self.rankascending = 1
        #load precalculated quantiles for sample peptides
        self.qf = self.get_quantile_data()

    def read_result(self, temp):
        """Read raw results from netMHCIIpan output"""
        ignore = ['Pos','#','Protein','']
        cols = self.colnames
        res = pd.read_csv(io.BytesIO(temp), comment='#', names=cols, sep='\s+',
                          error_bad_lines=False, skiprows=16).dropna(subset=['peptide'])
        res = res[~res.pos.isin(ignore)]
        return res

    def prepare_data(self, df, name):
        """Prepare netMHCIIpan results as a dataframe"""
        df = df.apply(lambda x: pd.to_numeric(x, errors='ignore').dropna())
        df['name'] = name
        df.rename(columns={'Core': 'core','HLA':'allele'}, inplace=True)
        df = df.drop(columns=['Pos','Identity','Rank'])
        df['allele'] = df.allele.apply(lambda x: self.convert_allele_name(x))
        df['score'] = pd.to_numeric(df.Affinity, errors='coerce')
        df['pos'] = df.index.copy()
        df = df.dropna()
        self.get_ranking(df)
        self.data = df
        return

    def predict(self, peptides, allele='HLA-DRB1*0101', name='temp',
                pseudosequence=None, show_cmd=False, **kwargs):
        """Call netMHCIIpan command line."""
        #assume allele names are in the standard format HLA-DRB1*0101
        allele = self.allele_mapping(allele)
        #write peptides to temp file
        pepfile = tempfile.mktemp()+'.pep'
        with open(pepfile, 'w') as f:
            for p in peptides:
                f.write(p+'\n')
        cmd = 'netMHCIIpan -f %s -inptype 1 -a %s' %(pepfile, allele)
        if show_cmd is True:
            print(cmd)
        try:
            temp = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
        except Exception as e:
            print(e)
            return
        rows = self.read_result(temp)
        res = pd.DataFrame(rows)
        if len(res) == 0:
            return res
        self.prepare_data(res, name)
        return self.data

    def allele_mapping(self, allele):
        if allele.startswith('HLA-DQA'):
            return allele.replace('*','').replace(':','')
        else:
            return allele.replace('*','_').replace(':','').replace('HLA-DRB','DRB')

    def get_alleles(self):
        """Get available alleles"""
        cmd = 'netMHCIIpan -list'
        try:
            temp = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
        except:
            print('netMHCIIpan not installed?')
            return []
        alleles = temp.decode().split('\n')
        return alleles

    def convert_allele_name(self, a):
        """Convert allele names to internally used form"""
        if not a.startswith('HLA'):
            return 'HLA-'+a.replace(':','')
        else:
            return a.replace(':','')

    def check_install(self):
        cmd = 'netMHCIIpan'
        try:
            temp = subprocess.check_output(cmd, shell=True)
            print('netMHCIIpan appears to be installed')
            return 1
        except:
            print('netMHCIIpan not found')
            return 0

    @classmethod
    def _get_name(self):
        return 'netmhciipan'
class IEDBMHCIPredictor(Predictor):
    """Uses the IEDB tools methods, requires the iedb-mhc1 tools.
       Tested with version 2.17"""

    def __init__(self, data=None, method='IEDB_recommended'):
        Predictor.__init__(self, data=data)
        self.name = 'iedbmhc1'
        self.methods = ['ann', 'IEDB_recommended', 'comblib_sidney2008',
                        'consensus', 'smm', 'netmhcpan', 'smmpmbec']
        self.scorekey = 'ic50'
        self.cutoff = 500
        self.operator = '<'
        self.rankascending = 1
        self.method = method
        self.iedbmhc1_path = defaults['iedbmhc1_path']
        self.qf = self.get_quantile_data()
        return

    def predict(self, sequence=None, peptides=None, length=11, overlap=1,
                allele='HLA-A*01:01', name='', method=None, show_cmd=False,
                **kwargs):
        """
        Use the IEDB MHC-I python module to get predictions. Requires that
        the IEDB MHC-I tools are installed locally.

        Args:
            sequence: a sequence to be predicted
            peptides: a list of arbitrary peptides instead of a single sequence
        Returns:
            pandas dataframe
        """
        if sequence is not None:
            tempf = os.path.join(self.temppath, name+'.fa')
            seqfile = write_fasta(sequence, id=name, filename=tempf)
        elif peptides is not None:
            if type(peptides) is not pd.DataFrame:
                peptides = self.seqs_to_dataframe(peptides)
            tempf = tempfile.mktemp()+'.txt'
            seqfile = sequtils.dataframe_to_fasta(peptides, outfile=tempf,
                                                  seqkey='peptide', idkey='name')
            length = peptides.peptide.str.len().max()
        else:
            return
        if not os.path.exists(self.iedbmhc1_path):
            print('IEDB MHC-I tools not found, set the iedbmhc1_path option')
            return
        if method == None:
            method = self.method
        if method not in self.methods:
            print('available methods are %s' %self.methods)
            return
        self.method = method
        cmd = os.path.join(self.iedbmhc1_path, 'src/predict_binding.py')
        cmd = cmd+' %s %s %s %s' %(method, allele, length, seqfile)
        if show_cmd == True:
            print(cmd)
        from subprocess import Popen, PIPE
        try:
            p = Popen(cmd, stdout=PIPE, shell=True)
            temp,error = p.communicate()
        except OSError as e:
            print(e)
            return
        df = self.prepare_data(temp, name)
        return df

    def prepare_data(self, rows, name):
        """Prepare data from results"""
        try:
            df = pd.read_csv(io.BytesIO(rows), sep="\t")
        except:
            #no output to read
            return
        if len(df) == 0:
            #print the error string from the output
            print(rows)
            return df
        df = df.replace('-', np.nan)
        df = df.dropna(axis=1, how='all')
        df.rename(columns={'start':'pos'}, inplace=True)
        df = df.drop(columns=['seq_num','end'])
        df['core'] = df.peptide
        df['name'] = name
        if 'method' not in df.columns:
            df['method'] = self.method
        if self.method in ['IEDB_recommended','consensus']:
            df['ic50'] = df.filter(regex="ic50").mean(1)
        if not 'score' in df.columns:
            df['score'] = df.ic50
        self.get_ranking(df)
        self.data = df
        return df

    def get_alleles(self):
        """Get available alleles from the model_list file and convert to
           standard names"""
        alleles = list(pd.read_csv(os.path.join(datadir,
                       'iedb_mhc1_alleles.csv')).allele.values)
        return alleles

    def get_allele_data(self):
        path = self.iedbmhc1_path
        if not os.path.exists(path):
            print('iedb tools not found')
            return
        c = os.path.join(path, 'src/predict_binding.py')
        for m in self.methods:
            cmd = c + ' %s mhc' %m
            temp = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
            print(temp)
        return

    def check_install(self):
        path = self.iedbmhc1_path
        if not os.path.exists(path):
            print('iedb MHC-I tools not found')
            return
        cmd = os.path.join(path, 'src/predict_binding.py')
        try:
            temp = subprocess.check_output(cmd, shell=True)
            print('IEDB MHC-I tools appear to be installed')
            return 1
        except:
            return 0

    @classmethod
    def _get_name(self):
        return 'iedbmhc1'
class IEDBMHCIIPredictor(Predictor):
    """Uses the IEDB MHC-II method, requires tools to be installed locally"""

    def __init__(self, data=None):
        Predictor.__init__(self, data=data)
        self.name = 'iedbmhc2'
        #score will store the ic50 value for most methods except sturniolo
        self.scorekey = 'score'
        self.cutoff = 500
        self.operator = '<'
        self.rankascending = 1
        self.methods = ['comblib','consensus3','IEDB_recommended',
                        'NetMHCIIpan','nn_align','smm_align','sturniolo']
        self.method = 'IEDB_recommended'
        self.iedbmhc2_path = defaults['iedbmhc2_path']
        self.qf = self.get_quantile_data()
        return

    def prepare_data(self, rows, name):
        """Read data from raw output"""
        if len(rows) == 0:
            return
        df = pd.read_csv(io.BytesIO(rows), sep=r'\t', engine='python',
                         index_col=False)
        extracols = ['Start','End','comblib_percentile','smm_percentile',
                     'nn_percentile','Sturniolo core',' Sturniolo score',
                     ' Sturniolo percentile']
        df.reset_index(inplace=True)
        df.rename(columns={'index':'pos','Sequence':'peptide','Allele':'allele'},
                  inplace=True)
        df['core'] = df[df.filter(regex="core").columns[0]]
        df['name'] = name
        if 'method' not in df.columns:
            df['method'] = self.method
        method = df.loc[0].method
        if method == 'Sturniolo':
            df['score'] = df.sturniolo_score
        else:
            df['ic50'] = df.filter(regex="ic50").mean(1)
        if not 'score' in df.columns:
            df['score'] = df.ic50
        self.get_ranking(df)
        self.data = df
        return df

    def predict(self, sequence=None, peptides=None, length=15, overlap=None,
                show_cmd=False, allele='HLA-DRB1*01:01',
                method='IEDB_recommended', name='', **kwargs):
        """
        Use the IEDB MHC-II python module to get predictions. Requires that
        the IEDB MHC-II tools are installed locally. A sequence argument is
        provided since the cmd line only accepts a whole sequence to be
        fragmented.
        """
        self.method = method
        if sequence is not None:
            seqfile = tempfile.mktemp()+'.fa'
            write_fasta(sequence, id=name, filename=seqfile)
        elif peptides is not None:
            if type(peptides) is not pd.DataFrame:
                peptides = self.seqs_to_dataframe(peptides)
            tempf = tempfile.mktemp()+'.txt'
            seqfile = sequtils.dataframe_to_fasta(peptides, outfile=tempf,
                                                  seqkey='peptide', idkey='name')
            length = peptides.peptide.str.len().max()
        else:
            return
        path = self.iedbmhc2_path
        if method == None:
            method = 'IEDB_recommended'
        if not os.path.exists(path):
            print('iedb MHC-II tools not found')
            return
        cmd = os.path.join(path, 'mhc_II_binding.py')
        cmd = cmd+' %s %s %s' %(method, allele, seqfile)
        if show_cmd is True:
            print(cmd)
        try:
            from subprocess import Popen, PIPE, STDOUT
            p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT,
                      close_fds=True)
            temp = p.stdout.read()
        except Exception as e:
            print(e)
            print('check allele %s is correct.' %allele)
            return
        data = self.prepare_data(temp, name)
        return data

    def get_alleles(self):
        alleles = list(pd.read_csv(os.path.join(datadir,
                       'iedb_mhc2_alleles.csv')).allele.values)
        return alleles

    def check_install(self):
        path = self.iedbmhc2_path
        if not os.path.exists(path):
            print('iedb MHC-II tools not found')
            return
        cmd = os.path.join(path, 'mhc_II_binding.py')
        try:
            temp = subprocess.check_output(cmd, shell=True)
            print('IEDB MHC-II tools appear to be installed')
            return 1
        except:
            return 0

    @classmethod
    def _get_name(self):
        return 'iedbmhc2'
class TEpitopePredictor(Predictor):
    """Predictor using the TEPITOPEpan QM method"""

    def __init__(self, data=None, **kwargs):
        Predictor.__init__(self, data=data)
        self.name = 'tepitope'
        self.pssms = tepitope.get_pssms()
        self.cutoff = 2
        self.operator = '>'
        self.rankascending = 0
        #load precalculated quantiles for sample peptides
        self.qf = self.get_quantile_data()

    def predict(self, peptides=None, length=9, overlap=1,
                allele='HLA-DRB1*0101', name='', pseudosequence=None, **kwargs):
        if length not in [9,11,13,15]:
            print('you should use lengths of 9, 11, 13 or 15.')
        allele = allele.replace(':','')
        if not allele in self.pssms:
            #compute a virtual matrix for alleles without a pssm
            m = tepitope.create_virtual_pssm(allele)
            if m is None:
                print('no such allele', allele)
                return pd.DataFrame()
        else:
            m = self.pssms[allele]
        m = m.transpose().to_dict()
        result = tepitope.get_scores(m, peptides=peptides)
        df = self.prepare_data(result, name, allele)
        self.data = df
        return df

    def supported_lengths(self):
        """Return supported peptide lengths"""
        return [9,11,13,15]

    def get_alleles(self):
        return tepitope.get_alleles()

    def check_alleles(self, alleles):
        alleles = [i.replace(':','') for i in alleles]
        a = self.get_alleles()
        found = list((set(a) & set(alleles)))
        return found

    @classmethod
    def _get_name(self):
        return 'tepitope'
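#Example: tepitope is the built-in MHC-II method and needs no external tools.
#A minimal sketch; the peptide and allele are illustrative.
#
#   P = TEpitopePredictor()
#   df = P.predict_peptides(['FVNQHLCGSHLVEAL'], alleles=['HLA-DRB1*0101'])
#   print(df[:5])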
'''class IEDBBCellPredictor(Predictor):
    """Uses the IEDB tools methods, requires the iedb bcell tools.
       see http://tools.immuneepitope.org/bcell """

    def __init__(self, data=None):
        Predictor.__init__(self, data=data)
        self.name = 'iedbbcell'
        self.scorekey = 'Score'
        self.methods = ['Chou-Fasman', 'Emini', 'Karplus-Schulz',
                        'Kolaskar-Tongaonkar', 'Parker', 'Bepipred']
        self.cutoff = 0.9
        self.operator = '>'
        self.rankascending = 0
        self.method = 'Bepipred'
        self.path = iedbbcellpath

    def predict(self, sequence=None, peptides=None, window=None, name=''):
        """Uses code from iedb predict_binding.py"""
        value = self.method
        currpath = os.getcwd()
        os.chdir(self.path)
        sys.path.append(self.path)
        from src.BCell import BCell
        bcell = BCell()
        filepath = os.path.join(self.path, 'bcell_scales.pickle')
        picklefile = open(filepath, 'rb')
        scale_dict = pickle.load(picklefile)
        bcell.scale_dict = scale_dict[value]
        if window == None:
            window = bcell.window
        center = "%d" %round(int(window)/2.0)
        if value == 'Emini':
            results = bcell.emini_method(value, sequence, window, center)
        elif value == 'Karplus-Schulz':
            results = bcell.karplusshulz_method(value, sequence, window, center)
        elif value == 'Kolaskar-Tongaonkar':
            results = bcell.kolaskartongaonkar_method(value, sequence, window, center)
        elif value == 'Bepipred':
            results = bcell.bepipred_method(value, sequence, window, center)
        else:
            results = bcell.classical_method(value, sequence, window, center)
        threshold = round(results[1][0], 3)
        temp = results[0]
        self.prepare_data(temp, name)
        os.chdir(currpath)
        return self.data

    def prepare_data(self, temp, name):
        df = pd.read_csv(temp, sep=",")
        if len(df) == 0:
            return
        df = df.dropna(axis=1, how='all')
        df['name'] = name
        self.data = df
        return

    def predict_sequences(self, recs, names=None, save=False,
                          label='', path='', **kwargs):
        """Get predictions for a set of proteins - no alleles so we override
           the base method for this too."""
        recs = sequtils.get_cds(recs)
        if names != None:
            recs = recs[recs.locus_tag.isin(names)]
        proteins = list(recs.iterrows())
        res = []
        for i,row in proteins:
            seq = row['translation']
            name = row['locus_tag']
            df = self.predict(sequence=seq, name=name)
            res.append(df)
            if save == True:
                fname = os.path.join(path, name+'.csv')
                df.to_csv(fname)
        self.data = res = pd.concat(res)
        return'''
class MHCFlurryPredictor(Predictor):
    """
    Predictor using MHCFlurry for MHC-I predictions. Requires you to install
    the python package mhcflurry with its dependencies.
    see https://github.com/hammerlab/mhcflurry
    """

    def __init__(self, data=None, **kwargs):
        Predictor.__init__(self, data=data)
        self.name = 'mhcflurry'
        self.cutoff = 500
        self.operator = '<'
        self.scorekey = 'score'
        self.rankascending = 1
        if self.check_install() == False:
            return
        self._check_models()
        #load precalculated quantiles for sample peptides
        self.qf = self.get_quantile_data()
        return

    def predict(self, peptides=None, overlap=1, show_cmd=False,
                allele='HLA-A0101', name='', **kwargs):
        """Uses the mhcflurry-predict command line tool for prediction"""
        p = ' '.join(peptides)
        tmp = tempfile.mkstemp()[1]
        try:
            cmd = 'mhcflurry-predict --alleles {a} --peptides {p} --out {t}'\
                   .format(a=allele, p=p, t=tmp)
            if show_cmd == True:
                print(cmd)
            subprocess.check_output(cmd, shell=True)
        except Exception as e:
            print('failed to predict for allele %s' %allele)
            print(e)
            return
        df = pd.read_csv(tmp)
        df['score'] = df.mhcflurry_affinity
        df['pos'] = df.index+1
        df['name'] = name
        self.get_ranking(df)
        self.data = df
        os.remove(tmp)
        return df

    def convert_allele_name(self, r):
        if ':' not in r:
            return r[:5]+'*'+r[5:7]+':'+r[7:]
        else:
            return r[:5]+'*'+r[5:7]+r[7:]

    def get_alleles(self):
        with open(os.path.join(datadir, 'mhcflurry_alleles.txt')) as f:
            p = f.readlines()
        p = [x.strip() for x in p]
        p = list(filter(None, p))
        return p

    def check_install(self):
        try:
            from mhcflurry import Class1AffinityPredictor
            return True
        except:
            #use pip install mhcflurry to install
            return False

    def _check_models(self):
        try:
            import mhcflurry
            mhcflurry.class1_affinity_predictor.get_default_class1_models_dir()
        except:
            print('first use. downloading class-I models')
            subprocess.check_output('mhcflurry-downloads fetch models_class1',
                                    shell=True)
            return True
        return False

    def predict_sequences(self, recs, **kwargs):
        """Override so we can switch off multi threading."""
        kwargs['threads'] = 1
        results = Predictor.predict_sequences(self, recs, **kwargs)
        return results

    def predict_peptides(self, peptides, **kwargs):
        """Override so we can switch off multi threading."""
        kwargs['threads'] = 1
        data = Predictor.predict_peptides(self, peptides, **kwargs)
        return data

    @classmethod
    def _get_name(self):
        return 'mhcflurry'
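#Example: a sketch of MHC-I prediction via mhcflurry, assuming the package
#and its models are installed (pip install mhcflurry, then
#mhcflurry-downloads fetch models_class1). The peptide is illustrative.
#
#   P = MHCFlurryPredictor()
#   df = P.predict_peptides(['SLYNTVATL'], alleles=['HLA-A0101'])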
'''class MHCNuggetsPredictor(Predictor):
    """
    Predictor using MHCNuggets for MHC-I predictions. Requires you to install
    the package locally from https://github.com/KarchinLab/mhcnuggets
    see https://www.biorxiv.org/content/early/2017/07/27/154757
    """

    def __init__(self, data=None):
        Predictor.__init__(self, data=data)
        self.name = 'mhcnuggets'
        self.cutoff = 500
        self.operator = '<'
        self.scorekey = 'ic50'
        self.rankascending = 1
        self.qf = self.get_quantile_data()
        return

    def predict(self, peptides=None, overlap=1, allele='HLA-A01:01',
                name='temp', **kwargs):
        """Uses cmd line call to mhcnuggets."""
        allele = allele.replace('*','')
        from mhcnuggets.src.predict import predict
        pepfile = self.write_seqs(peptides)
        outfile = tempfile.mktemp()+'.txt'
        predict(class_='I', peptides_path=pepfile, mhc=allele, output=outfile)
        res = pd.read_csv(outfile)
        df = self.prepare_data(res, name, allele)
        return df

    def prepare_data(self, df, name, allele):
        """Get result into dataframe"""
        df['name'] = name
        df['allele'] = allele
        df['pos'] = df.index.copy()
        self.get_ranking(df)
        self.data = df
        return df

    def write_seqs(self, peptides):
        tempf = tempfile.mktemp()+'.pep'
        f = open(tempf, 'w')
        for p in peptides:
            f.write("%s\n" % p)
        f.close()
        return tempf

    def get_alleles(self):
        with open(os.path.join(datadir, 'mhcnuggets_alleles.txt')) as f:
            p = f.readlines()
        p = [x.strip() for x in p]
        p = list(filter(None, p))
        return p

    @classmethod
    def _get_name(self):
        return 'mhcnuggets' '''
class DummyPredictor(Predictor):
    """Returns random scores. Used for testing"""

    def __init__(self, data=None, scoring=None):
        Predictor.__init__(self, data=data)
        self.name = 'null'
        self.scorekey = 'score'
        self.cutoff = 500
        self.operator = '<'
        self.rankascending = 1

    def predict(self, peptides, allele='HLA-A*01:01', name='temp', **kwargs):
        #random log50k values; randint(0,1) would always return 0
        sc = np.random.uniform(0, 1, len(peptides))
        res = pd.DataFrame(np.column_stack([peptides,sc]),
                           columns=['peptide','log50k'])
        res['score'] = res.log50k.astype('float').apply(lambda x: mhclearn.log50k2aff(x))
        df = self.prepare_data(res, name, allele)
        return df

    @classmethod
    def _get_name(self):
        return 'dummy'
class BasicMHCIPredictor(Predictor):
    """Built-in basic MHC-I predictor. Should be used as a fallback if no
       other predictors are available."""

    def __init__(self, data=None, scoring=None):
        Predictor.__init__(self, data=data)
        self.name = 'basicmhc1'
        self.scorekey = 'score'
        self.cutoff = 500
        self.operator = '<'
        self.rankascending = 1
        self.qf = self.get_quantile_data()
        self.encoder = mhclearn.one_hot_encode
        return

    def predict(self, peptides, allele='HLA-A*01:01', name='temp', **kwargs):
        """Encode and predict peptides with the saved regressor"""
        length = len(peptides[0])
        #only lengths 9-11 are supported
        if length < 9 or length > 11:
            return
        encoder = self.encoder
        if type(peptides) is list:
            s = pd.Series(peptides)
        else:
            s = peptides
        #encode
        X = s.apply(lambda x: pd.Series(encoder(x)), 1)
        reg = mhclearn.get_model(allele, length)
        if reg is None:
            print('no model for this allele')
            return
        sc = reg.predict(X).round(4)
        res = pd.DataFrame(np.column_stack([peptides,sc]),
                           columns=['peptide','log50k'])
        res['log50k'] = res.log50k.astype('float')
        res['score'] = res.log50k.apply(lambda x: mhclearn.log50k2aff(x)).round(2)
        df = self.prepare_data(res, name, allele)
        return df

    def prepare_data(self, df, name, allele):
        """Put raw prediction data into a DataFrame and rank. Can be
           overriden for custom processing."""
        df['name'] = name
        df['pos'] = df.index+1
        df['allele'] = allele
        self.get_ranking(df)
        return df

    def supported_lengths(self):
        """Return supported peptide lengths"""
        return [9,10,11]

    def get_alleles(self):
        p = mhclearn.get_allele_names()
        return p

    def check_install(self):
        reg = mhclearn.build_predictor('HLA-A*01:01')
        if reg is None:
            return False
        return True

    def predict_peptides(self, peptides, **kwargs):
        """Override so we can train models before predictions."""
        mhclearn.train_models(alleles=kwargs['alleles'], encoder=self.encoder)
        data = Predictor.predict_peptides(self, peptides, **kwargs)
        return data

    def predict_sequences(self, recs, **kwargs):
        """Override so we can train models before predictions."""
        mhclearn.train_models(alleles=kwargs['alleles'], encoder=self.encoder)
        results = Predictor.predict_sequences(self, recs, **kwargs)
        return results

    @classmethod
    def _get_name(self):
        return 'basicmhc1'
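#Example: the built-in fallback predictor, a sketch needing no external tools.
#Models are trained on first use, so the initial call may be slow. The
#peptides and allele are illustrative.
#
#   P = BasicMHCIPredictor()
#   df = P.predict_peptides(['SLYNTVATL','GILGFVFTL'], alleles=['HLA-A*01:01'])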