Source code for iodata.formats.pdb

# IODATA is an input and output module for quantum chemistry.
# Copyright (C) 2011-2019 The IODATA Development Team
#
# This file is part of IODATA.
#
# IODATA is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# IODATA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>
# --
"""PDB file format.

There are different formats of pdb files. The convention used here is the
last updated one and is described in this link:
http://www.wwpdb.org/documentation/file-format-content/format33/v3.3.html
"""

from collections.abc import Iterator
from typing import TextIO
from warnings import warn

import numpy as np

from ..docstrings import (
    document_dump_many,
    document_dump_one,
    document_load_many,
    document_load_one,
)
from ..iodata import IOData
from ..periodic import bond2num, num2sym, sym2num
from ..utils import LineIterator, LoadError, LoadWarning, angstrom

__all__ = ()


PATTERNS = ["*.pdb"]


def _parse_pdb_atom_line(line, lit):
    """Parse an ATOM or HETATM line from a PDB file.

    Parameters
    ----------
    line
        A string with a single ATOM or HETATM line.
    lit
        The line iterator which read the line, used for generating warnings when needed.

    Returns
    -------
    atnum
        Atomic number.
    atname
        The atom name.
    resname
        The residua name.
    chainid
        The chain ID.
    resnum
        The residue number.
    atcoord
        Cartesian coordinates of the atomic nucleus.
    occupancy
        The occupancy (usually from the XRD analysis).
    bfactor
        The temperature factor (usually from the XRD analysis).

    """
    # Overview of ATOM records
    #     COLUMNS        DATA  TYPE    FIELD        DEFINITION
    # -------------------------------------------------------------------------------------
    #  1 -  6        Record name   "ATOM  "
    #  7 - 11        Integer       serial       Atom  serial number.
    # 13 - 16        Atom          name         Atom name.
    # 17             Character     altLoc       Alternate location indicator.
    # 18 - 20        Residue name  resName      Residue name.
    # 22             Character     chainID      Chain identifier.
    # 23 - 26        Integer       resSeq       Residue sequence number.
    # 27             AChar         iCode        Code for insertion of residues.
    # 31 - 38        Real(8.3)     x            Orthogonal coordinates for X in Angstroms.
    # 39 - 46        Real(8.3)     y            Orthogonal coordinates for Y in Angstroms.
    # 47 - 54        Real(8.3)     z            Orthogonal coordinates for Z in Angstroms.
    # 55 - 60        Real(6.2)     occupancy    Occupancy.
    # 61 - 66        Real(6.2)     tempFactor   Temperature  factor.
    # 77 - 78        LString(2)    element      Element symbol, right-justified.
    # 79 - 80        LString(2)    charge       Charge  on the atom.

    # Get element symbol from position 77:78 in pdb format
    symbol = line[76:78].strip()
    if len(symbol) > 0:
        atnum = sym2num.get(symbol)
    else:
        # If not present, guess it from position 13:16 (atom name)
        atname = line[12:16].strip()
        atnum = sym2num.get(atname, sym2num.get(atname[:2].title(), sym2num.get(atname[0], None)))
        warn(
            LoadWarning("Using the atom name in the PDB file to guess the chemical element.", lit),
            stacklevel=2,
        )
    if atnum is None:
        atnum = 0
        warn(
            LoadWarning(
                f"Failed to determine the atomic number. atname='{atname}' symbol='{symbol}'", lit
            ),
            stacklevel=2,
        )

    # atom name, residue name, chain id, & residue sequence number
    atname = line[12:16].strip()
    resname = line[17:20].strip()
    chainid = line[21]
    resnum = int(line[22:26])
    # add x, y, and z
    atcoord = [
        float(line[30:38]) * angstrom,
        float(line[38:46]) * angstrom,
        float(line[46:54]) * angstrom,
    ]
    # get occupancies & temperature factor
    occupancy = float(line[54:60])
    bfactor = float(line[60:66])
    return atnum, atname, resname, chainid, resnum, atcoord, occupancy, bfactor


def _parse_pdb_conect_line(line):
    # Overview of CONECT records
    # COLUMNS       DATA  TYPE      FIELD        DEFINITION
    # -------------------------------------------------------------------------
    #  1 -  6       Record name    "CONECT"
    #  7 - 11       Integer        serial       Atom  serial number
    # 12 - 16       Integer        serial       Serial number of bonded atom
    # 17 - 21       Integer        serial       Serial number of bonded atom
    # 22 - 26       Integer        serial       Serial number of bonded atom
    # 27 - 31       Integer        serial       Serial number of bonded atom
    iatom0 = int(line[7:12]) - 1
    for ipos in 12, 17, 22, 27:
        serial_str = line[ipos : ipos + 5].strip()
        if serial_str != "":
            iatom1 = int(serial_str) - 1
            if iatom1 > iatom0:
                yield iatom0, iatom1



[docs]
@document_load_one("PDB", ["atcoords", "atnums", "atffparams", "extra"], ["title", "bonds"])
def load_one(lit: LineIterator) -> dict:
    """Do not edit this docstring. It will be overwritten."""
    title_lines = []
    compnd_lines = []
    atnums = []
    attypes = []
    restypes = []
    chainids = []
    resnums = []
    atcoords = []
    occupancies = []
    bfactors = []
    bonds = []
    molecule_found = False
    end_reached = False
    while True:
        try:
            line = next(lit)
        except StopIteration:
            break
        # If the PDB file has a title, replace the default.
        if line.startswith("TITLE"):
            title_lines.append(line[10:].strip())
        if line.startswith("COMPND"):
            compnd_lines.append(line[10:].strip())
        if line.startswith(("ATOM", "HETATM")):
            (atnum, attype, restype, chainid, resnum, atcoord, occupancy, bfactor) = (
                _parse_pdb_atom_line(line, lit)
            )
            atnums.append(atnum)
            attypes.append(attype)
            restypes.append(restype)
            chainids.append(chainid)
            resnums.append(resnum)
            atcoords.append(atcoord)
            occupancies.append(occupancy)
            bfactors.append(bfactor)
            molecule_found = True
        if line.startswith("CONECT"):
            for iatom0, iatom1 in _parse_pdb_conect_line(line):
                bonds.append([iatom0, iatom1, bond2num["un"]])
        if line.startswith("END") and molecule_found:
            end_reached = True
            break
    if not molecule_found:
        raise LoadError("Molecule could not be read.", lit)
    if not end_reached:
        warn(
            LoadWarning("The END is not found, but the parsed data is returned.", lit), stacklevel=2
        )

    # Data related to force fields
    atffparams = {
        "attypes": np.array(attypes),
        "restypes": np.array(restypes),
        "resnums": np.array(resnums),
    }
    # Extra data
    extra = {
        "occupancies": np.array(occupancies),
        "bfactors": np.array(bfactors),
    }
    if len(compnd_lines) > 0:
        extra["compound"] = "\n".join(compnd_lines)
    # add chain id, if it wasn't all empty
    if not np.all(chainids == [" "] * len(chainids)):
        extra["chainids"] = np.array(chainids)
    # Set a useful title
    if len(title_lines) == 0:
        # Some files use COMPND instead of TITLE, in which case COMPND will be
        # used as title.
        if "compound" in extra:
            title = extra["compound"]
            del extra["compound"]
        else:
            title = "PDB file loaded by IOData"
    else:
        title = "\n".join(title_lines)
    result = {
        "atcoords": np.array(atcoords),
        "atnums": np.array(atnums),
        "atffparams": atffparams,
        "title": title,
        "extra": extra,
    }
    # assign bonds only if some were present
    if len(bonds) > 0:
        result["bonds"] = np.array(bonds)
    return result




[docs]
@document_load_many("PDB", ["atcoords", "atnums", "atffparams", "extra"], ["title"])
def load_many(lit: LineIterator) -> Iterator[dict]:
    """Do not edit this docstring. It will be overwritten."""
    # PDB files with more molecules are a simple concatenation of individual PDB files,'
    # making it trivial to load many frames.
    try:
        while True:
            yield load_one(lit)
    except (StopIteration, LoadError):
        return



def _dump_multiline_str(f: TextIO, key: str, value: str):
    r"""Write a multiline string in PDB format.

    Parameters
    ----------
    f
        A file object to write to.
    key
        The key used to prefix the multiline string, e.g. `"TITLE"`.
    value
        A (multiline) string, with multiple lines separated by `\n`.

    """
    prefix = key.ljust(10)
    for iline, line in enumerate(value.split("\n")):
        print(prefix + line, file=f)
        prefix = key + str(iline + 2).rjust(10 - len(key)) + " "



[docs]
@document_dump_one("PDB", ["atcoords", "atnums", "extra"], ["atffparams", "title", "bonds"])
def dump_one(f: TextIO, data: IOData):
    """Do not edit this docstring. It will be overwritten."""
    _dump_multiline_str(f, "TITLE", data.title or "Created with IOData")
    if "compound" in data.extra:
        _dump_multiline_str(f, "COMPND", data.extra["compound"])
    # Prepare for ATOM lines.
    attypes = data.atffparams.get("attypes", None)
    restypes = data.atffparams.get("restypes", None)
    resnums = data.atffparams.get("resnums", None)
    occupancies = data.extra.get("occupancies", None)
    bfactors = data.extra.get("bfactors", None)
    chainids = data.extra.get("chainids", None)
    # Write ATOM lines.
    for i in range(data.natom):
        n = num2sym[data.atnums[i]]
        resnum = -1 if resnums is None else resnums[i]
        x, y, z = data.atcoords[i] / angstrom
        occ = 1.00 if occupancies is None else occupancies[i]
        b = 0.00 if bfactors is None else bfactors[i]
        attype = str(n + str(i + 1)) if attypes is None else attypes[i]
        restype = "XXX" if restypes is None else restypes[i]
        chain = " " if chainids is None else chainids[i]
        out1 = f"{i + 1:>5d} {attype:<4s} {restype:3s} {chain:1s}{resnum:>4d}    "
        out2 = f"{x:8.3f}{y:8.3f}{z:8.3f}{occ:6.2f}{b:6.2f}{n:>12s}"
        print("ATOM  " + out1 + out2, file=f)
    if data.bonds is not None:
        # Prepare for CONECT lines.
        connections = [[] for iatom in range(data.natom)]
        for iatom0, iatom1 in data.bonds[:, :2]:
            connections[iatom0].append(iatom1)
            connections[iatom1].append(iatom0)
        # Write CONECT lines.
        for iatom0, iatoms1 in enumerate(connections):
            if len(iatoms1) > 0:
                # Write connection in groups of max 4
                for ichunk in range(len(iatoms1) // 4 + 1):
                    other_atoms_str = "".join(
                        f"{iatom1 + 1:5d}" for iatom1 in iatoms1[ichunk * 4 : ichunk * 4 + 4]
                    )
                    conect_line = f"CONECT{iatom0 + 1:5d}{other_atoms_str}"
                    print(conect_line, file=f)
    print("END", file=f)




[docs]
@document_dump_many("PDB", ["atcoords", "atnums", "extra"], ["atffparams", "title"])
def dump_many(f: TextIO, datas: Iterator[IOData]):
    """Do not edit this docstring. It will be overwritten."""
    # Similar to load_many, this is relatively easy.
    for data in datas:
        dump_one(f, data)