Source code for epitopepredict.utilities

#!/usr/bin/env python

"""
    Utilities for epitopepredict
    Created March 2013
    Copyright (C) Damien Farrell
"""

from __future__ import absolute_import, print_function
import os, math, csv, string
import shutil
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio import PDB

home = os.path.expanduser("~")

def venndiagram(names, labels, ax=None, colors=('r','g','b'), **kwargs):
    """Plot a venn diagram of 2 or 3 sets"""

    from matplotlib_venn import venn2, venn3
    import matplotlib.pyplot as plt
    f = None
    if ax is None:
        f = plt.figure(figsize=(4,4))
        ax = f.add_subplot(111)
    if len(names) == 2:
        n1, n2 = names
        v = venn2([set(n1), set(n2)], set_labels=labels,
                  set_colors=colors, **kwargs)
    elif len(names) == 3:
        n1, n2, n3 = names
        v = venn3([set(n1), set(n2), set(n3)], set_labels=labels,
                  set_colors=colors, **kwargs)
    ax.axis('off')
    #f.patch.set_visible(False)
    ax.set_axis_off()
    return f
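# Example usage (a minimal sketch; the peptide lists below are hypothetical):
#predicted = ['SIINFEKL','GILGFVFTL','NLVPMVATV']
#observed = ['GILGFVFTL','NLVPMVATV','LLWNGPMAV']
#f = venndiagram([predicted, observed], labels=('predicted','observed'))
#f.savefig('venn.png')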
def compress(filename, remove=False):
    """Compress a file with gzip"""

    import gzip
    with open(filename, 'rb') as fin, gzip.open(filename+'.gz', 'wb') as fout:
        fout.writelines(fin)
    if remove == True:
        os.remove(filename)
    return
def rmse(ar1, ar2):
    """Root mean squared error"""

    ar1 = np.asarray(ar1)
    ar2 = np.asarray(ar2)
    dif = ar1 - ar2
    dif *= dif
    return np.sqrt(dif.sum()/len(ar1))
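# Example usage (values are illustrative):
#rmse([1.0, 2.0, 3.0], [1.5, 2.0, 2.5])  # -> ~0.408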
def add_dicts(a, b):
    """Merge two dicts, summing the values of any keys present in both"""
    return dict((n, a.get(n, 0)+b.get(n, 0)) for n in set(a)|set(b))
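# Example usage:
#add_dicts({'a':1,'b':2}, {'b':3,'c':4})  # -> {'a': 1, 'b': 5, 'c': 4}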
def copyfile(source, dest, newname=None):
    """Helper method to copy a file into a destination folder,
       optionally renaming it"""

    if not os.path.exists(source):
        #print ('no such file %s' %source)
        return False
    if newname is None:
        newname = os.path.basename(source)
    shutil.copy(source, newname)
    dest = os.path.join(dest, newname)
    if os.path.exists(dest):
        os.remove(dest)
    shutil.move(newname, dest)
    return True
def copyfiles(path, files):
    """Copy a list of files from a folder into the current directory"""

    for f in files:
        src = os.path.join(path, f)
        print (src)
        if not os.path.exists(src):
            return False
        shutil.copy(src, f)
    return True
def symmetrize(m, lower=True):
    """Return a symmetric array by reflecting one triangle of m
       across the diagonal"""

    m = m.fillna(0)
    if lower == True:
        return np.tril(m) + np.triu(m.T) - np.diag(np.diag(m))
    else:
        return np.triu(m) + np.tril(m.T) - np.diag(np.diag(m))
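# Example usage (a minimal sketch; m is a hypothetical lower-triangular
# distance matrix with NaNs above the diagonal):
#m = pd.DataFrame([[0,np.nan,np.nan],[1,0,np.nan],[2,3,0]],
#                 columns=list('abc'), index=list('abc'))
#symmetrize(m)  # -> array([[0.,1.,2.],[1.,0.,3.],[2.,3.,0.]])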
def get_symmetric_data_frame(m):
    """Return a symmetrized copy of a dataframe"""

    x = symmetrize(m)
    return pd.DataFrame(x, columns=m.columns, index=m.index)
def find_filefrom_string(files, string):
    """Return the first filename whose stem contains the given string"""

    for f in files:
        if string in os.path.splitext(f)[0]:
            return f
    return ''
def find_files(path, ext='txt'):
    """Recursively list files under a directory with a given extension"""

    if not os.path.exists(path):
        print ('no such directory: %s' %path)
        return []
    files = []
    for dirname, dirnames, filenames in os.walk(path):
        for f in filenames:
            name = os.path.join(dirname, f)
            if f.endswith(ext):
                files.append(name)
    return files
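# Example usage (the path is hypothetical):
#fastafiles = find_files(os.path.join(home, 'sequences'), ext='fasta')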
def find_folders(path):
    """Recursively list all folders under a directory, including itself"""

    if not os.path.exists(path):
        print ('no such directory: %s' %path)
        return []
    dirs = []
    for dirname, dirnames, filenames in os.walk(path):
        dirs.append(dirname)
    return dirs
def reorder_filenames(files, order):
    """Reorder filenames to match another list, matching by substring;
       unmatched entries are returned as empty strings"""

    new = []
    for i in order:
        found = False
        for f in files:
            if i in f:
                new.append(f)
                found = True
        if found == False:
            new.append('')
    return new
def read_iedb(filename, key='Epitope ID'):
    """Load an IEDB peptide csv file and return a dict of rows keyed
       by the given column"""

    #cr = csv.reader(open(filename,'r'))
    cr = csv.DictReader(open(filename,'r'), quotechar='"')
    cr.fieldnames = [field.strip() for field in cr.fieldnames]
    D = {}
    for r in cr:
        k = r[key]
        D[k] = r
    return D
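# Example usage (the filename and epitope id are hypothetical):
#records = read_iedb('iedb_export.csv')
#records['12345']['Epitope Linear Sequence']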
def get_sequencefrom_pdb(pdbfile, chain='C', index=0):
    """Get the amino acid sequence of a chain from a PDB file"""

    parser = PDB.PDBParser(QUIET=True)
    struct = parser.get_structure(pdbfile, pdbfile)
    ppb = PDB.PPBuilder()
    model = struct[0]
    peptides = ppb.build_peptides(model[chain])
    seq = ''
    for i, pep in enumerate(peptides):
        seq += str(pep.get_sequence())
    return seq
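# Example usage (the PDB file name is hypothetical):
#seq = get_sequencefrom_pdb('1KLU.pdb', chain='C')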
def filter_iedb_file(filename, field, search):
    """Return IEDB data filtered on a given field"""

    X = pd.read_csv(filename)
    cols = ['PubMed ID','Author','Journal','Year','T Cell ID','MHC Allele Name',
            'Epitope Linear Sequence','Epitope Source Organism Name']
    y = X[X[field].str.contains(search)]
    print (y[cols])
    y.to_csv('filtered.csv', columns=cols)
    return y
def search_pubmed(term, max_count=100):
    """Search pubmed for a term and print the matching article titles"""

    from Bio import Entrez

    def fetch_details(id_list):
        ids = ','.join(id_list)
        Entrez.email = 'your.email@example.com'
        handle = Entrez.efetch(db='pubmed', retmode='xml', id=ids)
        results = Entrez.read(handle)
        return results

    def search(query):
        Entrez.email = 'your.email@example.com'
        handle = Entrez.esearch(db='pubmed', sort='relevance',
                                retmax=max_count, retmode='xml', term=query)
        results = Entrez.read(handle)
        return results

    results = search(term)
    id_list = results['IdList']
    papers = fetch_details(id_list)
    for i, paper in enumerate(papers['PubmedArticle']):
        print("%d) %s" % (i+1, paper['MedlineCitation']['Article']['ArticleTitle']))
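# Example usage; requires network access and setting a real Entrez email:
#search_pubmed('epitope prediction MHC', max_count=10)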
# Pretty print the first paper in full to observe its structure
#import json
#print(json.dumps(papers['PubmedArticle'][0], indent=2, separators=(',', ':')))
def test():
    sourcefasta = os.path.join(home, 'dockingdata/fastafiles/1KLU.fasta')
    findClosestStructures(sourcefasta)
#fetchPDBList('MHCII_homologs.csv')

if __name__ == '__main__':
    test()