
#!/usr/bin/env python

"""
    epitopepredict analysis methods
    Created September 2013
    Copyright (C) Damien Farrell
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License
    as published by the Free Software Foundation; either version 3
    of the License, or (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

from __future__ import absolute_import, print_function
import sys, os, shutil, string, types
import csv, glob, pickle, itertools
import math
import re
import time, random
from collections import OrderedDict
from operator import itemgetter
import numpy as np
import pandas as pd
import subprocess
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from . import base, sequtils, tepitope, utilities, peptutils

home = os.path.expanduser("~")
#fix paths!
genomespath = os.path.join(home, 'epitopedata')
datadir = os.path.join(home, 'testpredictions')

def get_AAcontent(df, colname, amino_acids=None):
    """Amino acid composition for dataframe with sequences"""

    return df.apply(lambda r: peptutils.get_AAfraction(str(r[colname]), amino_acids), 1)
def net_charge(df, colname):
    """Net peptide charge for dataframe with sequences"""

    return df.apply(lambda r: peptutils.net_charge(r[colname]), 1)
def isoelectric_point(df):
    """Isoelectric point for peptides in a dataframe"""

    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    def getpi(seq):
        X = ProteinAnalysis(seq)
        return X.isoelectric_point()
    return df.apply(lambda r: getpi(r.peptide), 1)
def peptide_properties(df, colname='peptide'):
    """Find hydrophobicity and net charge for peptides"""

    df['hydro'] = get_AAcontent(df, colname)
    df['net_charge'] = net_charge(df, colname)
    return df
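
#Example usage of the property functions above (a minimal sketch; the peptide
#sequences are arbitrary illustrations and 'hydro' is whatever fraction
#peptutils.get_AAfraction returns by default):
# >>> df = pd.DataFrame({'peptide': ['SYFPEITHI', 'KLAKLAKKLA']})
# >>> df = peptide_properties(df)
# >>> df[['peptide','hydro','net_charge']]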
def _center_nmer(x, n):
    """Get n-mer sequence for a peptide centered in the middle.
    This should be applied to a dataframe per row.
    Returns: a single sequence centred on the peptide
    """

    seq = x['translation']
    size = x.end-x.start
    l = int((size-n)/2.0)
    if size>n:
        if size%2 == 1:
            l1 = l+1
        else:
            l1 = l
        start = x.start+l1
        end = x.end-l
    elif size<=n:
        if size%2 == 1:
            l1 = l-1
        else:
            l1 = l
        start = x.start+l1
        end = x.end-l
    if start<=0:
        d = 1-start
        start = start+d
        end = end+d
    seq = seq[start:end]
    #print(size, x.peptide, x.start, x.end, l, l1, start, end, seq, len(seq))
    return seq

def _split_nmer(x, n, key, margin=3, colname='peptide'):
    """Row based method to split a peptide into multiple n-mers if it's
    too large. Returns a single column dataframe so should be applied
    using iterrows and then combined with concat.
    Args:
        x: row item
        n: length to split on
        key: column name of the host protein sequence
    """

    size = x.end-x.start
    m = margin
    if size <= n+m:
        seq = _center_nmer(x, n)
        return pd.DataFrame({colname: seq}, index=[0])
    else:
        seq = x[key]
        o = size%n
        if o<=margin:
            size = size-o
            seq = _center_nmer(x, size)
        seqs = []
        seq = x[key][x.start:x.end]
        if x.start==0:
            s = 1
        else:
            s = 0
        for i in range(s, size, n):
            if i+n>size:
                seqs.append(seq[o:o+n])
            else:
                seqs.append(seq[i:i+n])
        seqs = pd.Series(seqs)
        d = pd.DataFrame({colname: seqs})
        return d
def create_nmers(df, genome, length=20, seqkey='translation', key='nmer',
                 how='split', margin=0):
    """Get n-mer peptides surrounding a set of sequences using the host
       protein sequence.
    Args:
        df: input dataframe with sequence name and start/end coordinates
        genome: genome dataframe with host sequences
        length: length of nmer to return
        seqkey: column name of sequence to be processed
        how: method to create the n-mer; 'split' will try to split the
          sequence into overlapping n-mers if it is larger than length,
          'center' will center the n-mer on the peptide
        margin: do not split sequences below length+margin
    Returns:
        pandas DataFrame with the new n-mers (may be longer than the
        input dataframe if splitting)
    """

    cols = ['locus_tag','gene','translation']
    cols = list(set(cols) & set(genome.columns))
    #merge with genome dataframe but must keep index for re-merging
    if len(df)==0:
        return
    temp = df.merge(genome[cols], left_on='name', right_on='locus_tag',
                    how='left')
    if not 'end' in list(temp.columns):
        temp = base.get_coords(temp)
    if how == 'center':
        temp[key] = temp.apply(lambda r: _center_nmer(r, length), 1)
        res = temp
    elif how == 'split':
        res = []
        for n,r in temp.iterrows():
            d = _split_nmer(r, length, seqkey, margin, key)
            d['index'] = n
            d.set_index('index', inplace=True)
            res.append(d)
        res = pd.concat(res)
        res = temp.merge(res, left_index=True, right_index=True,
                         how='right').reset_index(drop=True)
    res = res.drop([seqkey], 1)
    return res
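
#Example usage (a sketch; assumes 'binders' has name/start/end columns and
#'genome' is a dataframe with locus_tag and translation columns, e.g. from
#sequtils.genbank2Dataframe):
# >>> nmers = create_nmers(binders, genome, length=20, how='split')
# >>> nmers = create_nmers(binders, genome, length=15, how='center')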
def get_overlaps(df1, df2, label='overlap', how='inside'):
    """
    Overlaps for 2 sets of sequences where the positions in the host
    sequence are stored in each dataframe as 'start' and 'end' columns.
    Args:
        df1: first set of sequences, a pandas dataframe with columns
          called start/end or pos
        df2: second set of sequences
        label: label for overlaps column
        how: may be 'any' or 'inside'
    Returns:
        first dataframe with no. of overlaps stored in a new column
    """

    new = []
    a = base.get_coords(df1)
    b = base.get_coords(df2)
    def overlap(x, y):
        f = 0
        for i,r in y.iterrows():
            if how == 'inside':
                if ((x.start<=r.start) & (x.end>=r.end)):
                    f+=1
            elif how == 'any':
                if ((x.start<=r.start) & (x.end>r.start)) or \
                   ((x.start>=r.start) & (x.start<r.end)):
                    f+=1
        return f
    for n,df in a.groupby('name'):
        found = b[b.name==n]
        df[label] = df.apply(lambda r: overlap(r,found), axis=1)
        new.append(df)
    result = pd.concat(new)
    return result
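
#Example usage (a sketch with hypothetical coordinates; assumes base.get_coords
#passes through existing start/end columns). With how='inside' the second
#sequence must lie entirely within the first to count:
# >>> df1 = pd.DataFrame({'name':['p1','p1'], 'peptide':['a','b'],
# ...                     'start':[10,50], 'end':[30,70]})
# >>> df2 = pd.DataFrame({'name':['p1'], 'peptide':['c'],
# ...                     'start':[12], 'end':[25]})
# >>> get_overlaps(df1, df2, how='inside')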
def get_orthologs(seq, db=None, expect=1, hitlist_size=400, equery=None,
                  email=''):
    """
    Fetch orthologous sequences using remote or local blast and return
    the records as a dataframe.
    Args:
        seq: sequence to blast
        db: the name of a local blast db
        expect: expect value
        hitlist_size: max number of hits to return from remote blast
        equery: Entrez Gene Advanced Search options,
          (see http://www.ncbi.nlm.nih.gov/books/NBK3837/)
    Returns:
        blast results in a pandas dataframe
    """

    from Bio.Blast import NCBIXML, NCBIWWW
    from Bio import Entrez, SeqIO
    Entrez.email = email
    print ('running blast..')
    if db != None:
        #local blast
        SeqIO.write(SeqRecord(Seq(seq), id='query'), 'tempseq.faa', "fasta")
        sequtils.local_blast(db, 'tempseq.faa', output='my_blast.xml', maxseqs=100)
        result_handle = open("my_blast.xml")
        df = sequtils.get_blast_results(result_handle)
    else:
        try:
            result_handle = NCBIWWW.qblast("blastp", "nr", seq, expect=expect,
                                           hitlist_size=hitlist_size,
                                           entrez_query=equery)
            time.sleep(2)
            savefile = open("my_blast.xml", "w")
            savefile.write(result_handle.read())
            savefile.close()
            result_handle = open("my_blast.xml")
            df = sequtils.get_blast_results(result_handle, local=False)
        except Exception as e:
            print ('blast timeout')
            return
    df = df.drop(['subj','positive','query_length','score'], 1)
    df.drop_duplicates(subset=['definition','perc_ident'], inplace=True)
    df = df[df['perc_ident']!=100]
    return df
def alignment_to_dataframe(aln):
    """Convert a Bio alignment object to a dataframe"""

    alnrows = [[str(a.id), str(a.seq)] for a in aln]
    df = pd.DataFrame(alnrows, columns=['accession','seq'])
    return df
def align_blast_results(df, aln=None, idkey='accession', productkey='definition'):
    """
    Get gapped alignment from blast results using the muscle aligner.
    """

    sequtils.dataframe_to_fasta(df, idkey=idkey, seqkey='sequence',
                                descrkey=productkey, outfile='blast_found.faa')
    aln = sequtils.muscle_alignment("blast_found.faa")
    alnrows = [[a.id, str(a.seq)] for a in aln]
    alndf = pd.DataFrame(alnrows, columns=['accession','seq'])
    res = df.merge(alndf, on=['accession'])
    res = res.drop('sequence', 1)
    #get rid of duplicate hits
    #res.drop_duplicates(subset=['definition','seq'], inplace=True)
    res = res.sort_values(by='identity', ascending=False)
    print ('%s hits, %s filtered' %(len(df), len(res)))
    return res, aln
def get_species_name(s):
    """Find [species name] in blast result definition"""

    m = re.search(r"[^[]*\[([^]]*)\]", s)
    if m == None:
        return s
    return m.groups()[0]
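
#Example (an illustrative blast definition string, not real output):
# >>> get_species_name('hypothetical protein [Mycobacterium tuberculosis]')
# 'Mycobacterium tuberculosis'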
def find_conserved_sequences(seqs, alnrows):
    """
    Find if sub-sequences are conserved in given set of aligned sequences.
    Args:
        seqs: a list of sequences to find
        alnrows: a dataframe of aligned protein sequences
    Returns:
        a pandas DataFrame of 1 (found) or NaN (not found) values for each
        protein/search sequence
    """

    f = []
    for i,a in alnrows.iterrows():
        sequence = a.seq
        found = [sequence.find(j) for j in seqs]
        f.append(found)
    #use the first available identifier column as the index
    for n in ['species','accession','name']:
        if n in alnrows.columns:
            ind = alnrows[n]
            break
    s = pd.DataFrame(f, columns=seqs, index=ind)
    s = s.replace(-1, np.nan)
    #a match at position 0 also counts as found
    s[s>=0] = 1
    return s
def epitope_conservation(peptides, alnrows=None, proteinseq=None, blastresult=None,
                         blastdb=None, perc_ident=50,
                         equery='srcdb_refseq[Properties]'):
    """
    Find and visualise conserved peptides in a set of aligned sequences.
    Args:
        peptides: a list of peptides/epitopes
        alnrows: a dataframe of previously aligned sequences e.g. custom strains
        proteinseq: a sequence to blast and get an alignment for
        blastresult: a file of saved blast results in plain csv format
        equery: blast query string
    Returns:
        matrix of 0 or 1 for conservation for each epitope/protein variant
    """

    import seaborn as sns
    sns.set_context("notebook", font_scale=1.4)
    if alnrows is None:
        if proteinseq is None:
            print ('protein sequence to blast or alignment required')
            return
        if blastresult is None or not os.path.exists(blastresult):
            blr = get_orthologs(proteinseq, equery=equery, db=blastdb)
            if blr is None:
                return
            if blastresult is not None:
                blr.to_csv(blastresult)
        else:
            blr = pd.read_csv(blastresult, index_col=0)
        alnrows, aln = align_blast_results(blr)
    if 'perc_ident' in alnrows.columns:
        alnrows = alnrows[alnrows.perc_ident>=perc_ident]
    if 'definition' in alnrows.columns:
        alnrows['species'] = alnrows.definition.apply(get_species_name)
    c = find_conserved_sequences(peptides, alnrows).T
    c = c.dropna(how='all')
    c = c.reindex(c.sum(1).sort_values().index)
    if len(c) == 0:
        print ('no conserved epitopes in any sequence')
        return
    return c
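
#Example usage (a sketch; 'seq' is a protein sequence string and the peptides
#are hypothetical epitopes; a remote blast run may take several minutes and
#results are cached in the given csv file):
# >>> peps = ['AEFTPAVHA', 'KLVANNTRL']
# >>> c = epitope_conservation(peps, proteinseq=seq, blastresult='blast_x.csv')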
def _region_query(P, eps, D):
    """Find all points within eps of point P"""

    neighbour_pts = []
    for point in D:
        if abs(P - point) < eps:
            neighbour_pts.append(point)
    return neighbour_pts

def _expand_cluster(P, neighbour_pts, C, c_n, eps, min_pts, D, visited):

    flatten = lambda l: [i for sublist in l for i in sublist]
    C[c_n].append(P)
    for point in neighbour_pts:
        if point not in visited:
            visited.append(point)
            neighbour_pts_2 = _region_query(point, eps, D)
            if len(neighbour_pts_2) >= min_pts:
                neighbour_pts += neighbour_pts_2
        if point not in flatten(C):
            C[c_n].append(point)

def _dbscan(D, eps=5, minsize=2):
    """
    Cluster 1D intervals using dbscan (density-based spatial clustering).
    Finds core samples of high density and expands clusters from them.
    """

    noise = []
    visited = []
    C = []
    c_n = -1
    for point in D:
        visited.append(point)
        neighbour_pts = _region_query(point, eps, D)
        if len(neighbour_pts) < minsize:
            noise.append(point)
        else:
            C.append([])
            c_n += 1
            _expand_cluster(point, neighbour_pts, C, c_n, eps, minsize, D, visited)
    C = [i for i in C if len(i) >= minsize]
    return C
def dbscan(B=None, x=None, dist=7, minsize=4):
    """Use dbscan algorithm to cluster binder positions"""

    if B is not None:
        if len(B)==0:
            return
        x = sorted(B.pos.astype('int'))
    clusts = _dbscan(x, dist, minsize)
    return clusts
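
#Example: clustering a list of 1D positions directly with _dbscan. The
#positions are arbitrary; isolated points such as 30 are treated as noise
#and duplicate single-point clusters are filtered out by minsize:
# >>> _dbscan([1, 2, 3, 10, 11, 12, 30], eps=3, minsize=2)
# [[1, 2, 3], [10, 11, 12]]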
def find_clusters(binders, dist=None, min_binders=2, min_size=12, max_size=50,
                  genome=None, colname='peptide'):
    """
    Get clusters of binders for a set of binders.
    Args:
        binders: dataframe of binders
        dist: distance over which to apply clustering
        min_binders: minimum binders to be considered a cluster
        min_size: smallest cluster length to return
        max_size: largest cluster length to return
        colname: name for cluster sequence column
    Returns:
        a pandas DataFrame with one row per cluster, giving the name,
        start/end positions and number of binders
    """

    C = []
    grps = list(binders.groupby('name'))
    length = binders.head(1).peptide.str.len().max()
    if dist == None:
        dist = length+1
        #print ('using dist for clusters: %s' %dist)
    for n,b in grps:
        if len(b)==0:
            continue
        clusts = dbscan(b, dist=dist, minsize=min_binders)
        if len(clusts) == 0:
            continue
        for c in clusts:
            C.append([n, min(c), max(c)+length, len(c)])
    if len(C)==0:
        print ('no clusters')
        return pd.DataFrame()
    x = pd.DataFrame(C, columns=['name','start','end','binders'])
    x['length'] = (x.end-x.start)
    x = x[x['length']>=min_size]
    x = x[x['length']<=max_size]
    x = x.sort_values(['binders','length'], ascending=False)
    if genome is not None:
        cols = ['locus_tag','translation']
        if 'gene' in genome.columns:
            cols.append('gene')
        x = x.merge(genome[cols], left_on='name', right_on='locus_tag')
        x[colname] = x.apply(lambda r: r.translation[r.start:r.end], 1)
        x = x.drop(['locus_tag','translation'], 1)
        x = x.drop_duplicates(colname)
    x = x.sort_values(by=['binders'], ascending=False)
    x = x.reset_index(drop=True)
    return x
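
#Example usage (a sketch; assumes 'b' is a binders dataframe with name, pos
#and peptide columns, e.g. from a predictor's promiscuous binders, and
#'genome' provides the protein sequences for extracting cluster peptides):
# >>> cl = find_clusters(b, genome=genome, min_binders=3)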
def randomize_dataframe(df, seed=8):
    """Randomize order of dataframe"""

    np.random.seed(seed=seed)
    new = df.reset_index(drop=True)
    new = new.reindex(np.random.permutation(new.index))
    return new
def save_to_excel(df, n=94, filename='peptide_lists'):
    """
    Save a dataframe to an excel file, splitting the rows into
    sheets of n rows each.
    """

    writer = pd.ExcelWriter('%s.xls' %filename)
    i = 1
    chunks = df.groupby(np.arange(len(df)) // n)
    for g,c in chunks:
        c.to_excel(writer, 'list'+str(i))
        i += 1
    writer.save()
    return
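
#Example (writes peptides.xls with sheets list1, list2, ... of 94 rows each,
#e.g. one sheet per synthesis plate):
# >>> save_to_excel(df, n=94, filename='peptides')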
def tmhmm(fastafile=None, infile=None):
    """
    Get TMhmm predictions.
    Args:
        fastafile: fasta input file to run
        infile: text file with tmhmm prediction output
    """

    if infile == None:
        tempfile = 'tmhmm_temp.txt'
        cmd = 'tmhmm %s > %s' %(fastafile, tempfile)
        #stdout is redirected to tempfile so read the predictions from there
        subprocess.check_output(cmd, shell=True, executable='/bin/bash')
        infile = tempfile
    tmpred = pd.read_csv(infile, delim_whitespace=True, comment='#',
                         names=['locus_tag','v','status','start','end'])
    tmpred = tmpred.dropna()
    print ('tmhmm predictions for %s proteins' %len(tmpred.groupby('locus_tag')))
    lengths = []
    for i,row in tmpred.iterrows():
        if row.status == 'TMhelix':
            lengths.append(row.end-row.start)
    return tmpred
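
#Example usage (assumes the tmhmm binary is installed and on the PATH):
# >>> tm = tmhmm(fastafile='proteins.faa')
# >>> helices = tm[tm.status=='TMhelix']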
def signalP(infile=None, genome=None):
    """Get signal peptide predictions"""

    if genome is not None:
        #assumes a genbank to fasta conversion utility is available
        seqfile = Genome.genbank2Fasta(genome)
        tempfile = 'signalp_temp.txt'
        cmd = 'signalp -t gram+ -f short %s > %s' %(seqfile, tempfile)
        #stdout is redirected to tempfile so read the predictions from there
        subprocess.check_output(cmd, shell=True, executable='/bin/bash')
        infile = tempfile
    sp = pd.read_csv(infile, delim_whitespace=True, comment='#', skiprows=2,
                     names=['locus_tag','Cmax','cpos','Ymax','ypos','Smax',
                            'spos','Smean','D','SP','Dmaxcut','net'])
    return sp
def get_seqdepot(seq):
    """Fetch seqdepot annotation for sequence"""

    from epitopepredict import seqdepot
    sd = seqdepot.new()
    aseqid = sd.aseqIdFromSequence(seq)
    try:
        result = sd.findOne(aseqid)
    except Exception as e:
        print (e)
        result = None
    return result
def prediction_coverage(expdata, binders, key='sequence', perc=50, verbose=False):
    """
    Determine hit rate of predictions in experimental data by finding how
    many top peptides are needed to cover perc % of the positives.
    Args:
        expdata: dataframe of experimental data with peptide sequence and name column
        binders: dataframe of ranked binders created from a predictor
        key: column name in expdata for sequence
    Returns:
        percentage of predicted binders required to find perc of the total response
    """

    def getcoverage(data, peptides, key):
        #get coverage for a single sequence
        target = math.ceil(len(data)*perc/100.0)
        if verbose == True:
            print (len(data), target)
        found = []
        count = 0
        for p in peptides:
            for i,r in data.iterrows():
                if r[key] in found:
                    continue
                if r[key].find(p)!=-1 or p.find(r[key])!=-1:
                    found.append(r[key])
                    if verbose == True:
                        print (count, p, r[key])
                    continue
            count += 1
            if len(found) >= target:
                if verbose == True:
                    print (count, target)
                    print ('--------------')
                return count
        if verbose == True:
            print ('not all sequences found', count, target)
        return count

    total = 0
    for name, data in expdata.groupby('name'):
        peptides = binders[binders.name==name].peptide
        if len(peptides) == 0:
            continue
        if verbose == True:
            print (name)
        c = getcoverage(data, peptides, key)
        total += c
    return round(total/float(len(binders))*100, 2)
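
#Example usage (a sketch; 'exp' holds experimentally confirmed positive
#peptides with name and sequence columns, 'pb' is a ranked binders dataframe
#from one of the predictors):
# >>> prediction_coverage(exp, pb, key='sequence', perc=50)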
def test_features():
    """test feature handling"""

    fname = os.path.join(datadir, 'MTB-H37Rv.gb')
    df = sequtils.genbank2Dataframe(fname, cds=True)
    df = df.set_index('locus_tag')
    keys = df.index
    name = 'Rv0011c'
    row = df.loc[name]
    seq = row.translation
    prod = row['product']
    rec = SeqRecord(Seq(seq), id=name, description=prod)
    fastafmt = rec.format("fasta")
    print (fastafmt)
    print (row.to_dict())
    ind = keys.get_loc(name)
    previous = keys[ind-1]
    if ind < len(keys)-1:
        next = keys[ind+1]
    else:
        next = None
    return
def testrun(gname):

    method = 'tepitope' #'iedbmhc1' #'netmhciipan'
    path = 'test'
    gfile = os.path.join(genomespath, '%s.gb' %gname)
    df = sequtils.genbank2Dataframe(gfile, cds=True)
    #names = list(df.locus_tag[:1])
    names = ['VP24']
    alleles1 = ["HLA-A*02:02", "HLA-A*11:01", "HLA-A*32:07", "HLA-B*15:17",
                "HLA-B*51:01", "HLA-C*04:01", "HLA-E*01:03"]
    alleles2 = ["HLA-DRB1*0101", "HLA-DRB1*0305", "HLA-DRB1*0812",
                "HLA-DRB1*1196", "HLA-DRB1*1346", "HLA-DRB1*1455",
                "HLA-DRB1*1457", "HLA-DRB1*1612", "HLA-DRB4*0107",
                "HLA-DRB5*0203"]
    P = base.getPredictor(method)
    P.iedbmethod = 'IEDB_recommended' #'netmhcpan'
    P.predictProteins(df, length=11, alleles=alleles2, names=names,
                      save=True, path=path)
    f = os.path.join('test', names[0]+'.mpk')
    df = pd.read_msgpack(f)
    P.data = df
    #b = P.get_binders(data=df)
    base.getScoreDistributions(method, path)
    return
def test_conservation(label, gname):
    """Conservation analysis"""

    tag = 'VP24'
    pd.set_option('max_colwidth', 800)
    gfile = os.path.join(genomespath, '%s.gb' %gname)
    g = sequtils.genbank2Dataframe(gfile, cds=True)
    res = g[g['locus_tag']==tag]
    seq = res.translation.head(1).squeeze()
    print (seq)
    #alnrows = get_orthologs(seq)
    #alnrows.to_csv('blast_%s.csv' %tag)
    alnrows = pd.read_csv('blast_%s.csv' %tag, index_col=0)
    alnrows.drop_duplicates(subset=['accession'], inplace=True)
    alnrows = alnrows[alnrows['perc_ident']>=60]
    seqs = [SeqRecord(Seq(a.sequence), a.accession) for i,a in alnrows.iterrows()]
    print (seqs[:2])
    sequtils.distanceTree(seqs=seqs) #,ref=seqs[0])
    '''method = 'tepitope'
    P = base.getPredictor(method)
    P.predictSequences(df, seqkey='sequence')
    b = P.get_binders()'''
    return
def find_conserved_peptide(peptide, recs):
    """Find sequences where a peptide is conserved"""

    f = []
    for i,a in recs.iterrows():
        seq = a.sequence.replace('-','')
        found = seq.find(peptide)
        f.append(found)
    s = pd.DataFrame(f, columns=['found'], index=recs.accession)
    s = s.replace(-1, np.nan)
    return s
def test():
    gname = 'ebolavirus'
    label = 'test'
    testrun(gname)
    #test_conservation(label, gname)
    #test_features()
    return
if __name__ == '__main__':
    pd.set_option('display.width', 600)
    test()