# IODATA is an input and output module for quantum chemistry.
# Copyright (C) 2011-2019 The IODATA Development Team
#
# This file is part of IODATA.
#
# IODATA is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# IODATA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>
# --
"""Extended XYZ file format.
The extended XYZ file format is defined in the
`ASE documentation <https://wiki.fysik.dtu.dk/ase/ase/io/formatoptions.html#xyz>`_.
Usually, the different frames in a trajectory describe different geometries of the same
molecule, with atoms in the same order. The ``load_many`` function below can also
handle an XYZ with different molecules, e.g. a molecular database.
"""
import shlex
from collections.abc import Iterator
import numpy as np
from ..docstrings import document_load_many, document_load_one
from ..periodic import num2sym, sym2num
from ..utils import LineIterator, LoadError, amu, angstrom, strtobool
from .xyz import load_one as load_one_xyz
__all__ = ()
PATTERNS = ["*.extxyz"]
def _convert_title_value(value: str):
"""Search for the correct dtype and convert the string."""
list_of_splits = value.split()
# If it is just one item, first try int, then float and finally return a bool or str
if len(list_of_splits) == 1:
value = value.strip()
if value.isdigit():
converted_value = int(value)
else:
try:
converted_value = float(value)
except ValueError:
try:
converted_value = strtobool(value)
except ValueError:
converted_value = value
else:
# Do the same but return it as a numpy array
try:
converted_value = np.array(list_of_splits, dtype=int)
except ValueError:
try:
converted_value = np.array(list_of_splits, dtype=float)
except ValueError:
try:
converted_value = np.array(
[strtobool(split) for split in list_of_splits], dtype=bool
)
except ValueError:
converted_value = np.array(list_of_splits, dtype=str)
return converted_value
def _parse_properties(properties: str, lit: LineIterator) -> list[tuple]:
"""Parse the properties listed in the title into atom_columns.
Parameters
----------
properties
The part of the title of an XYZ file containing column definitions.
lit
The LineIterator reading the file, only used for error messages.
Returns
-------
atom_columns
A list of tuples specifying the columns in the extended XYZ file.
For more details, see ``ATOM_COLUMNS_DOC`` in ``iodata.formats.xyz``.
"""
atom_columns = []
# Maps the dtype to the atom_columns dtype, load_word and dump_word
dtype_map = {
"S": (np.dtype("U25"), str, "{:10s}".format),
"R": (float, float, "{:15.10f}".format),
"I": (int, int, "{:10d}".format),
"L": (bool, strtobool, lambda boolean: "T" if boolean else "F"),
}
# Some predefined iodata attributes which can be mapped
# pos is assumed to be in angstrom, masses in amu (ase convention)
# No unit convertion takes place for the other attributes
atom_column_map = {
"pos": (
"atcoords",
None,
(3,),
float,
(lambda word: float(word) * angstrom),
(lambda value: f"{value / angstrom:15.10f}"),
),
"masses": (
"atmasses",
None,
(),
float,
(lambda word: float(word) * amu),
(lambda value: f"{value / amu:15.10f}"),
),
"force": (
"atgradient",
None,
(3,),
float,
(lambda word: -float(word)),
(lambda value: f"{-value:15.10f}"),
),
}
atnum_column = (
"atnums",
None,
(),
int,
(lambda word: int(word) if word.isdigit() else sym2num[word.title()]),
(lambda atnum: f"{num2sym[atnum]:2s}"),
)
splitted_properties = properties.split(":")
if len(splitted_properties) % 3 != 0:
raise LoadError(
f"Cannot parse property from the title line: '{properties}'. "
f"The expected format is name:dtype:shape.",
lit,
)
# Each property has 3 values: its name, dtype and shape
names = splitted_properties[::3]
dtypes = splitted_properties[1::3]
shapes = splitted_properties[2::3]
if "Z" in names:
# Try to map 'Z' to the 'atnums' attribute
atom_column_map["Z"] = atnum_column
elif "species" in names:
# If 'Z' is not present, use 'species'
atom_column_map["species"] = atnum_column
for name, dtype, shape in zip(names, dtypes, shapes):
if name in atom_column_map:
atom_columns.append(atom_column_map[name])
else:
# Use the 'extra' attribute to store values which are not predefined in iodata
shape_suffix = () if shape == "1" else (int(shape),)
atom_columns.append(("extra", name, shape_suffix, *dtype_map[dtype]))
return atom_columns
def _parse_title(title: str, lit: LineIterator) -> tuple[list[tuple], dict[str]]:
"""Parse the title in an extended xyz file.
Parameters
----------
title
The full title line.
lit
The LineIterator reading the file, only used for error messages.
Returns
-------
atom_columns
A list of tuples specifying the columns in the extended XYZ file.
For more details, see ``ATOM_COLUMNS_DOC`` in ``iodata.formats.xyz``.
data
Attributes to be included in the ``IOData`` instance,
taken from the title line.
"""
key_value_pairs = shlex.split(title)
# A dict of predefined iodata atrributes with their names and dtype convertion functions
def load_cellvecs(word):
return np.array(word.split(), dtype=float).reshape([3, 3]) * angstrom
iodata_attrs = {
"energy": ("energy", float),
"Lattice": ("cellvecs", load_cellvecs),
"charge": ("charge", float),
}
data = {}
for key_value_pair in key_value_pairs:
if "=" in key_value_pair:
key, value = key_value_pair.split("=", 1)
if key == "Properties":
atom_columns = _parse_properties(value, lit)
elif key in iodata_attrs:
data[iodata_attrs[key][0]] = iodata_attrs[key][1](value)
else:
data.setdefault("extra", {})[key] = _convert_title_value(value)
else:
# If no value is given, set it True
data.setdefault("extra", {})[key_value_pair] = True
return atom_columns, data
[docs]
@document_load_one(
"EXTXYZ",
["title"],
["atcoords", "atgradient", "atmasses", "atnums", "cellvecs", "charge", "energy", "extra"],
)
def load_one(lit: LineIterator) -> dict:
"""Do not edit this docstring. It will be overwritten."""
atom_line = next(lit)
title_line = next(lit)
# parse title
atom_columns, title_data = _parse_title(title_line, lit)
lit.back(title_line)
lit.back(atom_line)
xyz_data = load_one_xyz(lit, atom_columns)
# If the extra attribute is present, prevent it from overwriting itself
if "extra" in title_data and "extra" in xyz_data:
xyz_data["extra"].update(title_data["extra"])
title_data.update(xyz_data)
return title_data
[docs]
@document_load_many(
"EXTXYZ",
["title"],
["atcoords", "atgradient", "atmasses", "atnums", "cellvecs", "charge", "energy", "extra"],
)
def load_many(lit: LineIterator) -> Iterator[dict]:
"""Do not edit this docstring. It will be overwritten."""
# XYZ Trajectory files are a simple concatenation of individual XYZ files,'
# making it trivial to load many frames.
try:
while True:
# Check for and skip empty lines at the end of file
line = next(lit)
if line.strip() == "":
return
lit.back(line)
yield load_one(lit)
except StopIteration:
return