#!/usr/bin/env python
"""
epitopepredict analysis methods
Created September 2013
Copyright (C) Damien Farrell
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 3
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
from __future__ import absolute_import, print_function
import sys, os, shutil, string, types
import csv, glob, pickle, itertools
import math
import re
import time, random
from collections import OrderedDict
from operator import itemgetter
import numpy as np
import pandas as pd
import subprocess
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from . import base, sequtils, tepitope, utilities, peptutils
home = os.path.expanduser("~")
#fix paths!
genomespath = os.path.join(home, 'epitopedata')
datadir = os.path.join(home, 'testpredictions')
[docs]def get_AAcontent(df, colname, amino_acids=None):
"""Amino acid composition for dataframe with sequences"""
return df.apply( lambda r: peptutils.get_AAfraction(str(r[colname]), amino_acids), 1)
[docs]def net_charge(df, colname):
"""Net peptide charge for dataframe with sequences"""
return df.apply( lambda r: peptutils.net_charge(r[colname]),1)
[docs]def isoelectric_point(df):
def getpi(seq):
X = ProteinAnalysis(seq)
return X.isoelectric_point()
return df.apply( lambda r: getpi(r.peptide),1)
[docs]def peptide_properties(df, colname='peptide'):
"""Find hydrophobicity and net charge for peptides"""
df['hydro'] = get_AAcontent(df, colname)
df['net_charge'] = net_charge(df, colname)
return df
def _center_nmer(x, n):
"""Get n-mer sequence for a peptide centered in the middle.
This should be applied to a dataframe per row.
Returns: a single sequence centred on the peptide
"""
seq = x['translation']
size = x.end-x.start
l = int((size-n)/2.0)
if size>n:
if size%2 == 1: l1 = l+1
else: l1=l
start = x.start+l1
end = x.end-l
elif size<=n:
if size%2 == 1: l1 = l-1
else: l1=l
start = x.start+l1
end = x.end-l
if start<=0:
d=1-start
start = start+d
end = end+d
seq = seq[start:end]
#print(size, x.peptide, x.start, x.end, l, l1, start, end, seq, len(seq))
return seq
def _split_nmer(x, n, key, margin=3, colname='peptide'):
"""Row based method to split a peptide in to multiple n-mers
if it's too large. Returns a dataframe of 3 cols so should be
applied using iterrows and then use concat.
Args:
x: row item
n: length to split on
key:
"""
size = x.end-x.start
m = margin
if size <= n+m:
seq = _center_nmer(x, n)
return pd.DataFrame({colname: seq},index=[0])
else:
seq = x[key]
o=size%n
#print (size, o)
if o<=margin:
size=size-o
seq = _center_nmer(x, size)
#print (size)
seqs=[]
seq = x[key][x.start:x.end]
if x.start==0: s=1
else: s=0
for i in range(s, size, n):
if i+n>size:
seqs.append(seq[o:o+n])
#print (x.name,seq[x.start:x.start+n])
else:
seqs.append(seq[i:i+n])
#print (seq[i:i+n])
seqs = pd.Series(seqs)
d = pd.DataFrame({colname:seqs})
return d
[docs]def create_nmers(df, genome, length=20, seqkey='translation', key='nmer', how='split', margin=0):
"""Get n-mer peptide surrounding a set of sequences using the host
protein sequence.
Args:
df: input dataframe with sequence name and start/end coordinates
genome: genome dataframe with host sequences
length: length of nmer to return
seqkey: column name of sequence to be processed
how: method to create the n-mer, split will try to split up
the sequence into overlapping n-mes of length is larger than size
center will center the peptide
margin: do not split sequences below length+margin
Returns:
pandas Series with nmer values
"""
cols = ['locus_tag','gene','translation']
cols = list(set(cols) & set(genome.columns))
#merge with genome dataframe but must keep index for re-merging
if len(df)==0:
return
temp = df.merge(genome[cols],left_on='name',right_on='locus_tag',
how='left')#.set_index(df.index)
#print (temp)
if not 'end' in list(temp.columns):
temp = base.get_coords(temp)
#temp = base.get_coords(temp)
if how == 'center':
temp[key] = temp.apply( lambda r: _center_nmer(r, length), 1)
res = temp
elif how == 'split':
res=[]
for n,r in temp.iterrows():
d = _split_nmer(r, length, seqkey, margin, key)
d['index']=n
d.set_index('index',inplace=True)
res.append(d)
res = pd.concat(res)
#print (res)
res = temp.merge(res,left_index=True,right_index=True,how='right').reset_index(drop=True)
#print (res)
res=res.drop([seqkey],1)
return res
[docs]def get_overlaps(df1, df2, label='overlap', how='inside'):
"""
Overlaps for 2 sets of sequences where the positions in host sequence are stored
in each dataframe as 'start' and 'end' columns
Args:
df1 : first set of sequences, a pandas dataframe with columns called
start/end or pos
df2: second set of sequences
label: label for overlaps column
how: may be 'any' or 'inside'
Returns:
First DataFrame with no. of overlaps stored in a new column
"""
new=[]
a = base.get_coords(df1)
b = base.get_coords(df2)
def overlap(x,y):
f=0
#print x['name'],x.peptide
#print (x.start,x.end)
for i,r in y.iterrows():
if how == 'inside':
if ((x.start<=r.start) & (x.end>=r.end)):
f+=1
elif how == 'any':
if ((x.start<=r.start) & (x.end>r.start)) or \
((x.start>=r.start) & (x.start<r.end)):
#t = abs(r.start-x.start)
#print (a, b)
f+=1
#print (r.start,r.end, f)
return f
for n,df in a.groupby('name'):
found = b[b.name==n]
df[label] = df.apply(lambda r: overlap(r,found),axis=1)
new.append(df)
result = pd.concat(new)
#print ('%s with overlapping sequences' %len(result[result[label]>0]))
return result
[docs]def get_orthologs(seq, db=None, expect=1, hitlist_size=400, equery=None,
email=''):
"""
Fetch orthologous sequences using remote or local blast and return the records
as a dataframe.
Args:
seq: sequence to blast
db: the name of a local blast db
expect: expect value
equery: Entrez Gene Advanced Search options,
(see http://www.ncbi.nlm.nih.gov/books/NBK3837/)
Returns:
blast results in a pandas dataframe
"""
from Bio.Blast import NCBIXML,NCBIWWW
from Bio import Entrez, SeqIO
Entrez.email = email
print ('running blast..')
if db != None:
#local blast
SeqIO.write(SeqRecord(Seq(seq)), 'tempseq.faa', "fasta")
sequtils.local_blast(db, 'tempseq.faa', output='my_blast.xml', maxseqs=100)
result_handle = open("my_blast.xml")
df = sequtils.get_blast_results(result_handle)
else:
try:
result_handle = NCBIWWW.qblast("blastp", "nr", seq, expect=expect,
hitlist_size=500,entrez_query=equery)
time.sleep(2)
savefile = open("my_blast.xml", "w")
savefile.write(result_handle.read())
savefile.close()
result_handle = open("my_blast.xml")
df = sequtils.get_blast_results(result_handle, local=False)
except Exception as e:
print ('blast timeout')
return
df = df.drop(['subj','positive','query_length','score'],1)
df.drop_duplicates(subset=['definition','perc_ident'], inplace=True)
df = df[df['perc_ident']!=100]
return df
[docs]def alignment_to_dataframe(aln):
alnrows = [[str(a.id),str(a.seq)] for a in aln]
df = pd.DataFrame(alnrows,columns=['accession','seq'])
return df
[docs]def align_blast_results(df, aln=None, idkey='accession', productkey='definition'):
"""
Get gapped alignment from blast results using muscle aligner.
"""
sequtils.dataframe_to_fasta(df, idkey=idkey, seqkey='sequence',
descrkey=productkey, outfile='blast_found.faa')
aln = sequtils.muscle_alignment("blast_found.faa")
alnrows = [[a.id,str(a.seq)] for a in aln]
alndf = pd.DataFrame(alnrows,columns=['accession','seq'])
#res = df.merge(alndf, left_index=True, right_index=True)
res = df.merge(alndf, on=['accession'])
res = res.drop('sequence',1)
#get rid of duplicate hits
#res.drop_duplicates(subset=['definition','seq'], inplace=True)
res = res.sort_values(by='identity',ascending=False)
print ('%s hits, %s filtered' %(len(df), len(res)))
return res, aln
[docs]def get_species_name(s):
"""Find [species name] in blast result definition"""
m = re.search(r"[^[]*\[([^]]*)\]", s)
if m == None:
return s
return m.groups()[0]
[docs]def find_conserved_sequences(seqs, alnrows):
"""
Find if sub-sequences are conserved in given set of aligned sequences
Args:
seqs: a list of sequences to find
alnrows: a dataframe of aligned protein sequences
Returns:
a pandas DataFrame of 1 or 0 values for each protein/search sequence
"""
f=[]
for i,a in alnrows.iterrows():
sequence = a.seq
found = [sequence.find(j) for j in seqs]
f.append(found)
for n in ['species','accession','name']:
if n in alnrows.columns:
ind = alnrows[n]
break
s = pd.DataFrame(f,columns=seqs,index=ind)
s = s.replace(-1,np.nan)
s[s>0] = 1
res = s.count()
return s
[docs]def epitope_conservation(peptides, alnrows=None, proteinseq=None, blastresult=None,
blastdb=None, perc_ident=50, equery='srcdb_refseq[Properties]'):
"""
Find and visualise conserved peptides in a set of aligned sequences.
Args:
peptides: a list of peptides/epitopes
alnrows: a dataframe of previously aligned sequences e.g. custom strains
proteinseq: a sequence to blast and get an alignment for
blastresult: a file of saved blast results in plain csv format
equery: blast query string
Returns:
Matrix of 0 or 1 for conservation for each epitope/protein variant
"""
import seaborn as sns
sns.set_context("notebook", font_scale=1.4)
if alnrows is None:
if proteinseq == None:
print ('protein sequence to blast or alignment required')
return
if blastresult == None or not os.path.exists(blastresult):
blr = get_orthologs(proteinseq, equery=equery, blastdb=blastdb)
if blr is None:
return
#if filename == None: filename = 'blast_%s.csv' %label
blr.to_csv(blastresult)
else:
blr = pd.read_csv(blastresult, index_col=0)
#blr = blr[blr.perc_ident>=perc_ident]
alnrows, aln = align_blast_results(blr)
#print (sequtils.formatAlignment(aln))
if 'perc_ident' in alnrows.columns:
alnrows = alnrows[alnrows.perc_ident>=perc_ident]
if 'definition' in alnrows.columns:
alnrows['species'] = alnrows.definition.apply(get_species_name)
c = find_conserved_sequences(peptides, alnrows).T
c = c.dropna(how='all')
c = c.reindex_axis(c.sum(1).sort_values().index)
if len(c) == 0:
print ('no conserved epitopes in any sequence')
return
return c
def _region_query(P, eps, D):
neighbour_pts = []
for point in D:
if abs(P - point)<eps:
neighbour_pts.append(point)
return neighbour_pts
def _expand_cluster(P, neighbour_pts, C, c_n, eps, min_pts, D, visited):
flatten = lambda l: [i for sublist in l for i in sublist]
C[c_n].append(P)
for point in neighbour_pts:
if point not in visited:
visited.append(point)
neighbour_pts_2 = _region_query(point, eps, D)
if len(neighbour_pts_2) >= min_pts:
neighbour_pts += neighbour_pts_2
#print (point,C)
if point not in flatten(C):
C[c_n].append(point)
def _dbscan(D, eps=5, minsize=2):
"""
1D intervals using dbscan. Density-Based Spatial clustering.
Finds core samples of high density and expands clusters from them.
"""
from numpy.random import rand
noise = []
visited = []
C = []
c_n = -1
for point in D:
visited.append(point)
neighbour_pts = _region_query(point, eps, D)
if len(neighbour_pts) < minsize:
noise.append(point)
else:
C.append([])
c_n+=1
_expand_cluster(point, neighbour_pts, C, c_n,eps, minsize, D, visited)
C = [i for i in C if len(i)>=minsize]
#for cl in C:
# print (cl)
return C
[docs]def dbscan(B=None, x=None, dist=7, minsize=4):
"""Use dbscan algorithm to cluster binder positions"""
if B is not None:
if len(B)==0:
return
x = sorted(B.pos.astype('int'))
clusts = _dbscan(x, dist, minsize)
#print (clusts)
return clusts
[docs]def find_clusters(binders, dist=None, min_binders=2, min_size=12, max_size=50,
genome=None, colname='peptide'):
"""
Get clusters of binders for a set of binders.
Args:
binders: dataframe of binders
dist: distance over which to apply clustering
min_binders : minimum binders to be considered a cluster
min_size: smallest cluster length to return
max_size: largest cluster length to return
colname: name for cluster sequence column
Returns:
a pandas Series with the new n-mers (may be longer than the initial dataframe
if splitting)
"""
C=[]
grps = list(binders.groupby('name'))
length = binders.head(1).peptide.str.len().max()
#print (length)
if dist == None:
dist = length+1
#print ('using dist for clusters: %s' %dist)
for n,b in grps:
if len(b)==0: continue
clusts = dbscan(b,dist=dist,minsize=min_binders)
if len(clusts) == 0:
continue
for c in clusts:
gaps = [c[i]-c[i-1] for i in range(1,len(c))]
C.append([n,min(c),max(c)+length,len(c)])
if len(C)==0:
print ('no clusters')
return pd.DataFrame()
x = pd.DataFrame(C,columns=['name','start','end','binders'])
x['length'] = (x.end-x.start)
x = x[x['length']>=min_size]
x = x[x['length']<=max_size]
x=x.sort_values(['binders','length'],ascending=False)
if genome is not None:
cols = ['locus_tag','translation']
if 'gene' in genome.columns:
cols.append('gene')
x = x.merge(genome[cols],
left_on='name',right_on='locus_tag')
x[colname] = x.apply(lambda r: r.translation[r.start:r.end], 1)
x = x.drop(['locus_tag','translation'],1)
x = x.drop_duplicates(colname)
x = x.sort_values(by=['binders'],ascending=False)
x = x.reset_index(drop=True)
#print ('%s clusters found in %s proteins' %(len(x),len(x.groupby('name'))))
#print
return x
[docs]def randomize_dataframe(df, seed=8):
"""Randomize order of dataframe"""
np.random.seed(seed=seed)
new = df.reset_index(drop=True)
new = new.reindex(np.random.permutation(new.index))
return new
[docs]def save_to_excel(df, n=94, filename='peptide_lists'):
"""
Save a dataframe to excel with option of writing in chunks.
"""
writer = pd.ExcelWriter('%s.xls' %filename)
i=1
chunks = df.groupby(np.arange(len(df)) // n)
for g,c in chunks:
c.to_excel(writer,'list'+str(i))
i+=1
writer.save()
return
[docs]def tmhmm(fastafile=None, infile=None):
"""
Get TMhmm predictions
Args:
fastafile: fasta input file to run
infile: text file with tmhmm prediction output
"""
if infile==None:
tempfile = 'tmhmm_temp.txt'
cmd = 'tmhmm %s > %s' %(fastafile,tempfile)
infile = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
tmpred = pd.read_csv(infile, delim_whitespace=True, comment='#',
names=['locus_tag','v','status','start','end'])
tmpred = tmpred.dropna()
print ('tmhmm predictions for %s proteins' %len(tmpred.groupby('locus_tag')))
lengths=[]
for i,row in tmpred.iterrows():
if row.status == 'TMhelix':
lengths.append(row.end-row.start)
#print np.mean(lengths), np.std(lengths)
return tmpred
[docs]def signalP(infile=None,genome=None):
"""Get signal peptide predictions"""
if genome != None:
seqfile = Genome.genbank2Fasta(genome)
tempfile = 'signalp_temp.txt'
cmd = 'signalp -t gram+ -f short %s > %s' %(seqfile,tempfile)
infile = subprocess.check_output(cmd, shell=True, executable='/bin/bash')
sp = pd.read_csv(infile,delim_whitespace=True,comment='#',skiprows=2,
names=['locus_tag','Cmax','cpos','Ymax','ypos','Smax',
'spos','Smean','D','SP','Dmaxcut','net'])
#print sp[sp.SP=='Y']
return sp
[docs]def get_seqdepot(seq):
"""Fetch seqdepot annotation for sequence"""
from epitopepredict import seqdepot
reload(seqdepot)
sd = seqdepot.new()
aseqid = sd.aseqIdFromSequence(seq)
try:
result = sd.findOne(aseqid)
except Exception as e:
print (e)
result=None
return result
[docs]def prediction_coverage(expdata, binders, key='sequence', perc=50, verbose=False):
"""
Determine hit rate of predictions in experimental data
by finding how many top peptides are needed to cover % positives
Args:
expdata: dataframe of experimental data with peptide sequence and name column
binders: dataframe of ranked binders created from predictor
key: column name in expdata for sequence
Returns:
fraction of predicted binders required to find perc total response
"""
def getcoverage(data, peptides, key):
#get coverage for single sequence
target = math.ceil(len(data)*perc/100.0)
if verbose == True:
print (len(data), target)
#print data[key]
#print peptides[peptides.isin(data[key])]
found=[]
count=0
for p in peptides:
for i,r in data.iterrows():
#print p, r[key]
if r[key] in found:
continue
if r[key].find(p)!=-1 or p.find(r[key])!=-1:
found.append(r[key])
if verbose == True:
print (count, p, r[key])
continue
count+=1
if len(found) >= target:
if verbose == True:
print (count, target)
print ('--------------')
return count
if verbose == True:
print ('not all sequences found', count, target)
return count
total = 0
for name, data in expdata.groupby('name'):
peptides = binders[binders.name==name].peptide
if len(peptides) == 0:
continue
if verbose == True: print (name)
#print (binders[binders.name==name][:5])
c = getcoverage(data, peptides, key)
total += c
#print (total, total/float(len(binders))*100)
return round(total/float(len(binders))*100,2)
[docs]def test_features():
"""test feature handling"""
fname = os.path.join(datadir,'MTB-H37Rv.gb')
df = sequtils.genbank2Dataframe(fname, cds=True)
df = df.set_index('locus_tag')
keys = df.index
name='Rv0011c'
row = df.ix[name]
seq = row.translation
prod = row['product']
rec = SeqRecord(Seq(seq),id=name,description=prod)
fastafmt = rec.format("fasta")
print (fastafmt)
print (row.to_dict())
ind = keys.get_loc(name)
previous = keys[ind-1]
if ind<len(keys)-1:
next = keys[ind+1]
else:
next=None
return
[docs]def testrun(gname):
method = 'tepitope'#'iedbmhc1'#'netmhciipan'
path='test'
gfile = os.path.join(genomespath,'%s.gb' %gname)
df = sequtils.genbank2Dataframe(gfile, cds=True)
#names = list(df.locus_tag[:1])
names=['VP24']
alleles1 = ["HLA-A*02:02", "HLA-A*11:01", "HLA-A*32:07", "HLA-B*15:17", "HLA-B*51:01",
"HLA-C*04:01", "HLA-E*01:03"]
alleles2 = ["HLA-DRB1*0101", "HLA-DRB1*0305", "HLA-DRB1*0812", "HLA-DRB1*1196", "HLA-DRB1*1346",
"HLA-DRB1*1455", "HLA-DRB1*1457", "HLA-DRB1*1612", "HLA-DRB4*0107", "HLA-DRB5*0203"]
P = base.getPredictor(method)
P.iedbmethod='IEDB_recommended' #'netmhcpan'
P.predictProteins(df,length=11,alleles=alleles2,names=names,
save=True, path=path)
f = os.path.join('test', names[0]+'.mpk')
df = pd.read_msgpack(f)
P.data=df
#b = P.get_binders(data=df)
#print b[:20]
base.getScoreDistributions(method, path)
return
[docs]def test_conservation(label,gname):
"""Conservation analysis"""
tag='VP24'
pd.set_option('max_colwidth', 800)
gfile = os.path.join(genomespath,'%s.gb' %gname)
g = sequtils.genbank2Dataframe(gfile, cds=True)
res = g[g['locus_tag']==tag]
seq = res.translation.head(1).squeeze()
print (seq)
#alnrows = getOrthologs(seq)
#alnrows.to_csv('blast_%s.csv' %tag)
alnrows = pd.read_csv('blast_%s.csv' %tag,index_col=0)
alnrows.drop_duplicates(subset=['accession'], inplace=True)
alnrows = alnrows[alnrows['perc_ident']>=60]
seqs=[SeqRecord(Seq(a.sequence),a.accession) for i,a in alnrows.iterrows()]
print (seqs[:2])
sequtils.distanceTree(seqs=seqs)#,ref=seqs[0])
#sequtils.ETETree(seqs, ref, metric)
#df = sequtils.getFastaProteins("blast_found.faa",idindex=3)
'''method='tepitope'
P = base.getPredictor(method)
P.predictSequences(df,seqkey='sequence')
b = P.get_binders()'''
return
[docs]def find_conserved_peptide(peptide, recs):
"""Find sequences where a peptide is conserved"""
f=[]
for i,a in recs.iterrows():
seq = a.sequence.replace('-','')
found = seq.find(peptide)
f.append(found)
s = pd.DataFrame(f,columns=['found'],index=recs.accession)
s = s.replace(-1,np.nan)
#print s
res = s.count()
return s
[docs]def test():
gname = 'ebolavirus'
label = 'test'
testrun(gname)
#testBcell(gname)
#testgenomeanalysis(label,gname,method)
#testconservation(label,gname)
#testFeatures()
return
if __name__ == '__main__':
pd.set_option('display.width', 600)
test()