From 1cec0b90e4173e05003e00b0c73e0d59f0f67b25 Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Thu, 5 Mar 2026 15:45:58 -0500
Subject: [PATCH 01/19] Explore lark as replacement for the pyparsing formula
 parser

---
 explore/lark_parse.py | 712 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 712 insertions(+)
 create mode 100644 explore/lark_parse.py

diff --git a/explore/lark_parse.py b/explore/lark_parse.py
new file mode 100644
index 0000000..aea2724
--- /dev/null
+++ b/explore/lark_parse.py
@@ -0,0 +1,712 @@
+import lark
+import periodictable as pt
+from periodictable.core import PeriodicTable
+from periodictable.core import default_table
+from periodictable.formulas import from_subscript, Formula, _mix_by_weight_pairs, _mix_by_volume_pairs
+from periodictable.formulas import VOLUME_UNITS, MASS_UNITS, LENGTH_UNITS
+
+grammar = """
+start      : SPACE? formula SPACE? # strip blank space from start and end
+formula    : compound | mixture
+
+# Mixture definitions:  quantity compound // quantity compound // quantity compound
+# Activation only cares about total mass, so you can freely mix masses and volumes if
+# you have the density for each component. Scattering cares about density of the mixture,
+# which in general is different from the mixture of densities.
+# To convert layers to masses for activation estimates we need density. Also need to scale by
+# area to convert density and thickness to mass. Assume unit area is cm^2, so for
+# example "4 (5 nm Ni // 2 mm Si)" is a 4 cm^2 wafer of nickel on silicon. If you
+# were to add a polymer you would need its density: "4 (20 nm C5H10@1.2 // 5 nm Ni // 2 mm Si)".
+
+mixture    : byamount | byvolume | byweight | layers
+byamount   : quantity compound (MIX quantity compound)*
+byvolume   : volumepct compound (MIX percentage compound)* MIX compound
+byweight   : weightpct compound (MIX percentage compound)* MIX compound
+layers     : thickness compound (MIX thickness compound)*
+quantity   : NUMBER SPACE? (MASS | VOLUME) SPACE
+weightpct  : NUMBER SPACE? WEIGHTPCT SPACE
+volumepct  : NUMBER SPACE? VOLUMEPCT SPACE
+thickness  : NUMBER SPACE? LENGTH SPACE
+percentage : NUMBER SPACE? "%" SPACE  # Allows "3 % "
+
+# Compound definition: number group ... @ density where group is El count El count ...
+# FASTA sequences: (rna|dna|aa) : SEQUENCE @ density
+# Density applies to the entire formula, such as "NaCl + 29.2H2O @ 1.07n"
+# If you do this as a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n"
+# Note: `[token]` leaves a None placeholder in the tree, unlike `token?`
+compound   : (composite | fasta) [density]
+fasta      : FASTA ":" SEQUENCE
+FASTA      : /dna|rna|aa/
+SEQUENCE   : /[A-Z -*]+/
+composite  : [NUMBER] group (SEPARATOR [NUMBER] group)*
+group      : ((atom | "(" formula ")") [COUNT])+
+atom       : SYMBOL [isotope] [charge]
+# could list all elements, but better error reporting if element symbol lookup fails
+SYMBOL     : /[A-Z][a-z]*/
+isotope    : "[" INTEGER "]"
+charge     : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE
+density    : SPACE? "@" SPACE? NUMBER [DENSITYMODE]
+
+# Tokens
+CHARGE     : /[+]+|[-]+/  # allow charge using {++} or {--}
+SUPERCHARGE: /\u207A+|\u207B+/  # Allow Ca++ and Cl- using superscript + and -
+DENSITYMODE: /[ni]/
+MIX        : SPACE? "//" SPACE?
+# maybe drop "wt%" and "vol%"
+WEIGHTPCT  : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/
+VOLUMEPCT  : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/
+MASS       : "kg" | "g" | "mg" | "ug" | "μg" | "ng"
+VOLUME     : "L" | "mL" | "uL" | "μL" | "nL"
+LENGTH     : "cm" | "mm" | "um" | "μm" | "nm"
+
+SEPARATOR  : SPACE? /[+•·]/ SPACE? | SPACE
+SPACE      : /[ \\t\\n\\r]+/
+COUNT      : NUMBER | SUBNUM  # atom counts can be normal numbers or unicode subscripts
+NUMBER     : INTEGER | FRACTION
+INTEGER    : /[1-9][0-9]*/
+FRACTION   : /([1-9][0-9]*|0)?[.][0-9]*/  # allow all floats?
+SUBNUM     : SUBINT | SUBFRAC
+SUBINT     : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*)/
+SUBFRAC    : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*|)([.][\u2080-\u2089]*)/
+SUPERINT   : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/
+"""
+
+parser = lark.Lark(grammar)
+
+def from_superscript(value: str) -> str:
+    """
+    Convert unicode superscript characters to normal characters. This allows us to parse,
+    for example, Ca²⁺ as Ca{2+}.
+    """
+    codepoints = {
+        '\u2070': '0', '\u00B9': '1', '\u00B2': '2', '\u00B3': '3',
+        '\u2074': '4', '\u2075': '5', '\u2076': '6', '\u2077': '7',
+        '\u2078': '8', '\u2079': '9', '\u207a': '+', '\u207b': '-',
+        '\u207c': '=', '\u207d': '(', '\u207e': ')',
+
+        '\u2071': 'i', '\u207f': 'n',
+    }
+    return ''.join(codepoints.get(char, char) for char in str(value))
+
+def int_or_float(s):
+    f = float(s)
+    i = int(f)
+    return i if i == f else f
+
+class StripJunk(lark.Transformer):
+    """
+    Token stripper visitor class.
+
+    This is done separately from the formula composer so that we can show the cleaned tree
+    before debugging the conversion.
+    """
+    def SEPARATOR(self, _):
+        """Strip token for molecular fragment separator (+ or center dot or spaces)."""
+        return lark.Discard
+    def MIX(self, _):
+        """Strip token for mixture separator //."""
+        return lark.Discard
+    def SPACE(self, _):
+        """Strip token for (usually optional) spaces."""
+        return lark.Discard
+    def WEIGHTPCT(self, _):
+        """Strip token for wt% mixture indicator."""
+        return lark.Discard
+    def VOLUMEPCT(self, _):
+        """Strip token for vol% mixture indicator."""
+        return lark.Discard
+
+class ConvertTokens(lark.Transformer):
+    """
+    Syntax tree to formula conversion class.
+    """
+    def __init__(self, text, table=None):
+        """
+        *text* is the original formula string.
+
+        *table* is an optional alternative periodic table.
+        """
+        self._context = text
+        self._table = default_table(table)
+
+    def VOLUME(self, token: lark.Token) -> tuple[str, str]:
+        """
+        Convert VOLUME terminal to ('volume', unit) pair.
+
+        Unit is a volume unit, such as mL or uL for microlitres.
+        """
+        return 'volume', token.value
+    def MASS(self, token: lark.Token) -> tuple[str, str]:
+        """
+        Convert MASS terminal to ('mass', unit) pair.
+
+        Unit is a mass unit, such as g or mg.
+        """
+        return 'mass', token.value
+    def LENGTH(self, token: lark.Token) -> tuple[str, str]:
+        """
+        Convert LENGTH terminal to ('length', unit) pair.
+
+        Unit is a length unit, such as cm or nm.
+        """
+        return 'length', token.value
+    def NUMBER(self, token: lark.Token) -> int|float:
+        """
+        Convert string to float or integer.
+
+        Numbers are used for quantities and percentages in mixtures, and for multiplier
+        counts to molecule fragments.
+        """
+        return int_or_float(token.value)
+    def INTEGER(self, token: lark.Token) -> int:
+        """
+        Convert string to float or integer
+        """
+        return int(token.value)
+    def COUNT(self, token: lark.Token) -> int|float:
+        """
+        Return the count value for a group component.
+
+        Count is specified after the symbol, either as an ASCII number or using subscript digits.
+        The period separator for fractional counts uses ASCII in both cases (there is no subscript
+        period character available). If the count is fractional return it as a float, otherwise
+        return it as an integer.
+        """
+        return int_or_float(from_subscript(token.value))
+    def SUPERINT(self, token):
+        """
+        Return the integer value of a sequence of superscript digits.
+
+        This is used in the charge rule as part of the valence specification for the atom.
+        """
+        return int(from_superscript(token.value))
+    def DENSITYMODE(self, token):
+        """
+        Return the value of the DENSITYMODE token, either "n" or "i". If no mode is specified
+        then a token value of None will be given to the density rule.
+        """
+        return token.value
+    def CHARGE(self, token):
+        """
+        Return a sequence of plus and minus characters. By grammar rules they must all have
+        the same sign.
+
+        This is used in the charge rule as part of the valence specification for the atom.
+        """
+        return token.value
+    def SUPERCHARGE(self, token):
+        """
+        Convert sequence of superscript plus and minus characters to ASCII plus and minus.
+
+        This is used in the charge rule as part of the valence specification for the atom.
+        """
+        return from_superscript(token.value)
+    def SYMBOL(self, token):
+        """
+        Look up the element in the periodic table and return it.
+
+        Raise ValueError if the element doesn't exist.
+        """
+        try:
+            return self._table.symbol(token.value)
+        except Exception:
+            raise ValueError(f"Element {token.value} doesn't exist")
+    def FASTA(self, token):
+        """
+        Return the token value as the fasta sequence type: "dna", "rna" or "aa".
+        """
+        return token.value
+    def SEQUENCE(self, token):
+        """
+        Return the token value as the fasta sequence string.
+        """
+        return token.value
+    def fasta(self, tokens):
+        """
+        Return a fasta sequence and its type.
+
+        Transform: [type, sequence] => ('fasta', type, sequence)
+        """
+        stype, sequence = tokens
+        return 'fasta', stype, sequence
+    def isotope(self, tokens):
+        """
+        Return the isotope number for the atom.
+
+        Transform: [isotope] => isotope
+        """
+        return tokens[0]
+    def charge(self, tokens):
+        """
+        Return valence from number and sign.
+
+        Valence is either a number followed by plus or minus, or a sequence of plus
+        or minus. If the number was specified it will already have been converted
+        to a value, otherwise use the length of the charge string as the value.
+
+        The valence can be given using superscript or regular ASCII number and sign
+        symbols. If ASCII then they need to be wrapped in braces such as Ca{2+}. The
+        token transform handles the conversion from superscript to ASCII characters
+        and the conversion from string to number.
+
+        Raise ValueError if a number was supplied along with multiple charge symbols.
+
+        Transform: [number|None, 'charge'] => valence
+
+        Example: ['{1+}'] => [1, '+'] = Ca.ion[1]
+        # Ca{++} => [None, '++'] = Ca.ion[2]
+        # Ca{3--} => [3, '--'] = Ca.ion[-3]  # value has precedence over charge
+        """
+        print("in charge with", tokens)
+        value, charge = tokens
+        if value is None:
+            value = len(charge)
+        elif value and len(charge) > 1:
+            self._raise_error(None, f"Using values of {value} for {value}{charge}")
+        valence = value if charge[0] == '+' else -value
+        return valence
+    def atom(self, tokens):
+        """
+        Returns an atom from the periodic table.
+
+        Usually this will use elements from the default table, but if an alternate table is
+        provided to the ConvertTokens constructor then that will be used to retrieve the element
+        from the symbol.
+
+        Isotope and charge are optional. By using the rule "SYMBOL [isotope] [charge|supercharge]"
+        with "[opt]" for the optional components rather than "opt?", the missing components appear
+        as None in the list of tokens. The "supercharge" option allows unicode superscripts to
+        be used to specify charge rather than curly braces "{charge}".
+
+        Raises an error if the symbol does not exist, does not have that isotope or doesn't
+        allow that charge.
+
+        Transform: ['symbol', isotope|None, charge|None] => atom
+
+        Example: ['H', 1, 1] => H[1]{+}
+
+        Example: ['Ca', None, 2] => Ca{2+}
+        """
+        #print("atom", tokens)
+        el, iso, ion = tokens
+        if iso and ion:
+            atom = el[iso].ion[ion]
+        elif iso:
+            atom = el[iso]
+        elif ion:
+            atom = el.ion[ion]
+        else:
+            atom = el
+        #print(f"atom {tokens} => {atom}")
+        return atom
+
+    def group(self, tokens):
+        """
+        Returns a sequence of (count, item) pairs, where item is an atom or a nested formula.
+        Missing counts default to 1.
+
+        Transform: [atom|formula, count|None, ...] => ((count, atom|formula), ...)
+        """
+        tokens = [1 if value is None else value for value in tokens]
+        pairs = tuple((count, item) for item, count in zip(tokens[::2], tokens[1::2]))
+        return pairs
+
+    def composite(self, tokens):
+        """
+        Returns a sequence of (number, group) pairs. Each group is a sequence of (count, item)
+        pairs, where item is an atom or a nested formula. Missing numbers default to 1.
+
+        Transform: [number|None, group, ...] => ((number, group), ...)
+
+        Example CaCO3 6H2O: [None, ((1, Ca), (1, C), (3, O)), 6, ((2, H), (1, O))]
+        => ((1, ((1, Ca), (1, C), (3, O))), (6, ((2, H), (1, O))))
+
+        Example CaCO3(H2O)6: [None, ((1, Ca), (1, C), (3, O), (6, formula('H2O')))]
+        => ((1, ((1, Ca), (1, C), (3, O), (6, formula('H2O')))),)
+        """
+        # print("in composite", tokens)
+        numbers = [1 if v is None else v for v in tokens[::2]]
+        groups = tokens[1::2]
+        pairs = tuple((number, group) for number, group in zip(numbers, groups))
+        return pairs
+
+    def fasta(self, tokens):
+        """
+        Returns the formula corresponding to the FASTA sequence, with the natural
+        density set. Labile hydrogen use H[1] in the formula.
+
+        The extra level of nesting in the return value is so that the fasta structure
+        is like a composite with a single group containing a nested formula.
+
+        Transform: [ /aa|dna|rna/, /[A-Z -*]+/ ] => (1, ((1, formula),))
+
+        Example dna:CAGT: ['dna', 'CAGT'] => (1, ((1, C39H37H[1]10N15O25P4@1.69),))
+        """
+        # TODO: fasta is ignoring table when parsing
+        # TODO: avoid circular imports
+        # TODO: support other biochemicals (carbohydrate residues, lipids)
+        from periodictable import fasta
+
+        # print("in fasta", tokens)
+        seq_type, seq = tokens
+        if seq_type not in fasta.CODE_TABLES:
+            raise ValueError(f"Invalid fasta sequence type '{seq_type}:'")
+        seq = fasta.Sequence(name=None, sequence=seq, type=seq_type)
+        group = ((1, seq.labile_formula),)
+        composite = ((1, group),)
+        return composite
+
+    def density(self, tokens):
+        """
+        Returns a density tuple from the @density construct. Density mode 'n' for
+        natural or 'i' for isotopic defaults to isotopic. That is, D2O@1.11 is the
+        isotopic density of D2O, not the natural density of H2O with conversion to
+        the heavier deuterium isotope.
+
+        Transform: [value, mode|None] => ('density', value, mode)
+
+        Example @1.11: [1.11, None] => ('density', 1.11, 'i')
+
+        Example @1.11i: [1.11, 'i'] => ('density', 1.11, 'i')
+
+        Example @1n: [1, 'n'] => ('density', 1, 'n')
+        """
+        value = tokens[0]
+        mode = 'i' if not tokens[1] else tokens[1]
+        return 'density', value, mode
+
+    def compound(self, tokens):
+        """
+        Returns the formula for the compound, with optional density set.
+
+        Density is ('density', value, mode) or None, where mode is 'i' for isotopic density
+        or 'n' for natural density.
+
+        The compound may come from a FASTA spec, such as dna:CAGT or from a composite, such
+        as CaCO3+6H2O. The composite may include an embedded formula, such as CaCO3(H2O)6.
+        In any case, the resulting material token will be a sequence of (multiplier, group)
+        pairs, where each group is a sequence of (count, item) pairs. Each item may be an
+        atom or a formula. The fasta transform returns a single group with a single item.
+        As a nested sequence this is ((1, ((1, formula), ...)), ...), with nothing in the
+        ellipses.
+
+        Transform: [((number, group), ...), ('density', value, mode)|None] => formula
+
+        Example NaCl@2.16i: [(1, ((1, Na), (1, Cl))), ('density', 2.16, 'i')] => NaCl@2.16i
+
+        Example dna:CAGT: [((1, ((1, C39H37H[1]10N15O25P4@1.69n),)),), None] => C39H37H[1]10N15O25P4@1.69n
+
+        Example CaCO3 6H2O: [((1, ((1, Ca), (1, C), (3, O))), (6, ((2, H), (1, O)))), None] => CaCO3(H2O)6
+
+        Example CaCO3(H2O)6: [((1, ((1, Ca), (1, C), (3, O), (6, H2O@None))),), None] => CaCO3(H2O)6
+        """
+        # print("in compound with", tokens)
+        components, density_tuple = tokens
+        if density_tuple is None:
+            density, density_mode = None, 'i'
+        else:
+            _, density, density_mode = density_tuple
+
+        # If a singleton formula with no density override then return it
+        # That is, [(1, ((1, formula),)), None] => formula
+        if density is None and len(components) == 1:
+            number, group = components[0]
+            if len(group) == 1 and number == 1:
+                count, item = group[0]
+                if count == 1 and isinstance(item, Formula):
+                    # print("isolated formula with no density override")
+                    return item
+
+        # Not an isolated formula, so expand formulas within the groups.
+        # That is, [..., (number, (..., (count, formula), ...)), ...]
+        # becomes [..., (number, (..., (count, formula.structure), ...)), ...]
+        def expand_formula(group):
+            return tuple((count, getattr(item, 'structure', item)) for count, item in group)
+        components = tuple((number, expand_formula(group)) for number, group in components)
+
+        # If it is a singleton group then use its structure as the formula structure.
+        if len(components) == 1 and components[0][0] == 1:
+            structure = components[0][1]
+        else:
+            structure = components
+
+        # Build the formula and assign density if available.
+        # print("compound structure", structure)
+        formula = Formula(structure=structure)
+        if density is not None:
+            if density_mode == 'n':
+                formula.natural_density = density
+            else:
+                formula.density = density
+
+        # print(f"compound = {formula} @ {formula.density}")
+        return formula
+
+    def weightpct(self, tokens):
+        """
+        Returns the percentage. The value has already been converted to a number.
+
+        Used as the first percentage of a mix by weight mixture.
+
+        Transform: [percent] => percent
+
+        Example for "3 wt%": [3] => 3
+        """
+        return tokens[0]
+
+    def volumepct(self, tokens):
+        """
+        Returns the percentage. The value has already been converted to a number.
+
+        Used as the first percentage of a mix by volume mixture.
+
+        Transform: [percent] => percent
+
+        Example for "3 vol%": [3] => 3
+        """
+        return tokens[0]
+
+    def percentage(self, tokens):
+        """
+        Returns the percentage. The value has already been converted to a number.
+
+        Transform: [percent] => percent
+
+        Example for " 3 % ": [3] => 3
+        """
+        return tokens[0]
+
+    def byweight(self, tokens):
+        """
+        Returns mixture by wt% of the various components in the system.
+
+        Raises ValueError if total exceeds 100%.
+
+        Transform: [percent, formula, ..., percent, formula, formula] => formula
+
+        Example: [76.95, D2O, H2O] => (D2O)3H2O
+        """
+        total = sum(tokens[:-1:2])
+        if total > 100:
+            raise ValueError(f"Total weight {total}% is more than 100%")
+        pairs = [(compound, percent) for percent, compound in zip(tokens[:-1:2], tokens[1:-1:2])]
+        pairs.append((tokens[-1], 100-total))
+        # return 'byweight', [*pairs, last_pair]
+        formula = _mix_by_weight_pairs(pairs)
+        # print(f"byweight => {formula} @ {formula.density}")
+        return formula
+
+    def byvolume(self, tokens):
+        """
+        Returns mixture by vol% of the various components in the system. Volumes are converted
+        to mass using density.
+
+        Raises ValueError if the density is missing from a component formula.
+        Raises ValueError if total exceeds 100%.
+
+        Transform: [percent, formula, ..., percent, formula, formula] => formula
+
+        Example: [75.0, D2O@1n, H2O@1n] => (D2O)3H2O
+        """
+        # print("by volume", tokens)
+        total = sum(tokens[:-1:2])
+        if total > 100:
+            raise ValueError(f"Total volume {total}% is more than 100%")
+        pairs = [(compound, percent) for percent, compound in zip(tokens[:-1:2], tokens[1:-1:2])]
+        pairs.append((tokens[-1], 100-total))
+        # print("byvolume pairs", pairs)
+        # print("byvolume density", [f.density for f, p in pairs])
+        #return 'byvolume', pairs
+        formula = _mix_by_volume_pairs(pairs)
+        return formula
+
+    def byamount(self, tokens):
+        """
+        Returns mixture by mass of the various components in the system. Volumes are converted
+        to mass using density.
+
+        Raises ValueError if the density is missing from a component formula.
+
+        Transform: [quantity, formula, ...] => formula
+
+        Example: [('mass', 5.07, 'g'), NaCl@2.16, ('volume', 50, 'mL'), H2O@1n] => NaCl(H2O)32
+        """
+        # print("byamount", tokens)
+        def find_value(quantity, formula):
+            qtype, value, units = quantity
+            if qtype == 'volume':
+                if formula.density is None:
+                    raise ValueError(f"Need the mass density of {formula}")
+                mass = value * VOLUME_UNITS[units] * 1000.0 * formula.density
+            else:
+                mass = value * MASS_UNITS[units]
+            return mass
+        values = [find_value(q, f) for q, f in zip(tokens[::2], tokens[1::2])]
+        total = sum(values)
+        percent = [(m/total)*100 for m in values]
+        formula = _mix_by_weight_pairs(zip(tokens[1::2], percent))
+        formula.total_mass = total
+        return formula
+
+    def layers(self, tokens):
+        """
+        Returns the mixture by volume of the various layers in the system.
+
+        Raises ValueError if the density is missing from a component formula.
+
+        Sets formula.thickness to the sum of the layer thicknesses.
+
+        Transform: [quantity, formula, ...] => formula
+
+        Example: [('length', 10.006, 'nm'), Ni, ('length', 3, 'mm'), Si] => NiSi164000
+        """
+        values = [value*LENGTH_UNITS[units] for dim, value, units in tokens[::2]]
+        total = sum(values)
+        percent = [(m/total)*100 for m in values]
+        formula = _mix_by_volume_pairs(zip(tokens[1::2], percent))
+        formula.thickness = total
+        return formula
+
+    def mixture(self, tokens):
+        """
+        Returns the formula representing the mixture, either byweight, byvolume, byamount or layers
+
+        Transform: [formula] => formula
+        """
+        return tokens[0]
+
+    def formula(self, tokens):
+        """
+        Return the formula representing the compound or mixture.
+
+        Transform:  [formula] => formula
+        """
+        return tokens[0]
+
+    def thickness(self, tokens):
+        """
+        Returns (dimension, value, unit) with dimension equal 'length'
+
+        Transform: [value, ('length', unit)] => ('length', value, unit)
+
+        Example: [5, ('length', 'nm')] => ('length', 5, 'nm')
+        """
+        value, (dim, units) = tokens
+        return dim, value, units
+
+    def quantity(self, tokens):
+        """
+        Returns (dimension, value, unit) with dimension equal 'mass' or 'volume'
+
+        Transform: [value, (dimension, unit)] => (dimension, value, unit)
+
+        Example: [5, ('mass', 'g')] => ('mass', 5, 'g')
+        """
+        value, (dim, units) = tokens
+        return dim, value, units
+
+    def start(self, tokens):
+        """
+        Return the final formula, with the original text attached.
+
+        Sets formula.source to 'parse string' before returning.
+
+        Transform: [formula] => formula
+        """
+        formula = tokens[0]
+        # TODO: add the source string to the formula class attributes
+        # Remember the string which was parsed
+        formula.source = self._context
+        return formula
+
+
+def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
+    """
+    Parse a chemical formula, returning a structure with elements from the
+    given periodic table.
+    """
+    cleanup = StripJunk()
+    convert = ConvertTokens(formula_str, table=table)
+    tree = parser(formula_str)
+    tree = cleanup.transform(tree)
+    tree = convert.transform(tree)
+    return tree
+
+examples = """
+Co
+dna:CAGT
+(Co@5)
+(((Co@5)@6))
+CaCO3
+CaCO₃
+CaCO3+6H2O
+CaCO3 6H2O
+CaCO3(H2O)6
+CaCO3 (H2O)6
+(Ca(CO3)((H2O)6))
+CaCO₃·6H₂O
+DHO
+!Ca{2++}  # could be interpreted as Ca{2+}
+Ca⁺⁺  # also Ca{2+}
+O²⁻
+H[1]
+H2O@1
+D2O@1n
+D2O @ 1.11
+D2O@1.11i
+HO{1-}
+H[1]{1-}O
+H2SO4
+C3H4H[1]NO@1.29n
+78.2H2O[16] + 21.8H2O[18] @1n
+50 wt% Co // Ti
+33 wt% Co // 33% Fe // Ti
+! 93 wt% Co // 33% Fe // Ti  # More than 100%
+20 vol% (10 wt% NaCl@2.16 // H2O@1) // D2O@1n
+NaCl(H2O)29.1966(D2O)122.794@1.10i
+5g NaCl // 50mL H2O@1
+5g NaCl@2.16 // 50mL H2O@1
+50 mL (45 mL H2O@1 // 5 g NaCl)@1.0707 // 20 mL D2O@1n
+1 cm Si // 5 nm Cr // 10 nm Au
+aa:RELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV
+
+# Error conditions. Mark with '!' so the exception is ignored
+! Bl2Oh
+! 5 Mg NaCl // 50mL H2O@1
+! 4 nm NaCl@2.17// 50 g Si
+
+"""
+
+def check():
+    cleanup = StripJunk()
+    def filt(tree):
+        #return tree
+        tree = cleanup.transform(tree)
+        # import pprint; pprint.pprint(tree)
+        tree = convert.transform(tree)
+        return tree
+
+    for line in examples.split('\n'):
+        formula = line.split('#')[0]
+        bad = formula.startswith('!')
+        if bad:
+            formula = formula[1:]
+        if formula:
+            print(f"*** {line}")
+            convert = ConvertTokens(text=formula)
+            try:
+                tree = filt(parser.parse(formula))
+                #print(f" => {tree.pretty()}")
+                density = getattr(tree, 'density', None)
+                density_str = f" @ {density:.2f}" if density else ""
+                print(f" => {tree}{density_str}")
+                # TODO: structure not preserved in mixtures
+                print(f"    {getattr(tree, 'structure', None)}")
+            except Exception as exc:
+                if bad:
+                    print(f"!!! Error: {exc}")
+                else:
+                    raise
+
+if __name__ == "__main__":
+    check()
\ No newline at end of file

From e2b5aec25e4dbbe9e10c22dfa0e9c03de18f9bf7 Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Thu, 5 Mar 2026 22:21:58 -0500
Subject: [PATCH 02/19] improve error display for lark parser

---
 explore/lark_parse.py | 180 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 146 insertions(+), 34 deletions(-)

diff --git a/explore/lark_parse.py b/explore/lark_parse.py
index aea2724..d773dcc 100644
--- a/explore/lark_parse.py
+++ b/explore/lark_parse.py
@@ -36,7 +36,8 @@
 # Note: `[token]` leaves a None placeholder in the tree, unlike `token?`
 compound   : (composite | fasta) [density]
 fasta      : FASTA ":" SEQUENCE
-FASTA      : /dna|rna|aa/
+FASTA      : /[a-z]+/ # Generic "str:sequence" syntax allows better error reporting
+#FASTA     : /dna|rna|aa/
 SEQUENCE   : /[A-Z -*]+/
 composite  : [NUMBER] group (SEPARATOR [NUMBER] group)*
 group      : ((atom | "(" formula ")") [COUNT])+
@@ -45,10 +46,12 @@
 SYMBOL     : /[A-Z][a-z]*/
 isotope    : "[" INTEGER "]"
 charge     : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE
-density    : SPACE? "@" SPACE? NUMBER [DENSITYMODE]
+density    : SPACE? "@" SPACE? DENSITY [DENSITYMODE]
+DENSITY    : NUMBER  # using alias DENSITY for number for better error reporting
 
 # Tokens
 CHARGE     : /[+]+|[-]+/  # allow charge using {++} or {--}
+SUPERINT   : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/
 SUPERCHARGE: /\u207A+|\u207B+/  # Allow Ca++ and Cl- using superscript + and -
 DENSITYMODE: /[ni]/
 MIX        : SPACE? "//" SPACE?
@@ -57,7 +60,7 @@
 VOLUMEPCT  : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/
 MASS       : "kg" | "g" | "mg" | "ug" | "μg" | "ng"
 VOLUME     : "L" | "mL" | "uL" | "μL" | "nL"
-LENGTH     : "cm" | "mm" | "um" | "μm" | "nm"
+LENGTH     : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å"
 
 SEPARATOR  : SPACE? /[+•·]/ SPACE? | SPACE
 SPACE      : /[ \\t\\n\\r]+/
@@ -68,10 +71,10 @@
 SUBNUM     : SUBINT | SUBFRAC
 SUBINT     : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*)/
 SUBFRAC    : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*|)([.][\u2080-\u2089]*)/
-SUPERINT   : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/
 """
 
-parser = lark.Lark(grammar)
+# propagate_positions saves start_pos and end_pos for each rule as well as each terminal.
+formula_parser = lark.Lark(grammar, propagate_positions=True)
 
 def from_superscript(value: str) -> str:
     """
@@ -99,6 +102,9 @@ class StripJunk(lark.Transformer):
 
     This is done separately from the formula composer so that we can show the cleaned tree
     before debugging the conversion.
+
+    Note: could get the same effect by renaming the unused terminals with leading underscore,
+    but that makes the grammar harder to read.
     """
     def SEPARATOR(self, _):
         """Strip token for molecular fragment separator (+ or center dot or spaces)."""
@@ -158,6 +164,7 @@ def NUMBER(self, token: lark.Token) -> int|float:
         counts to molecule fragments.
         """
         return int_or_float(token.value)
+    DENSITY = NUMBER  # We've aliased DENSITY and NUMBER in the grammar
     def INTEGER(self, token: lark.Token) -> int:
         """
         Convert string to float or integer
@@ -257,12 +264,12 @@ def charge(self, tokens):
         # Ca{++} => [None, '++'] = Ca.ion[2]
         # Ca{3--} => [3, '--'] = Ca.ion[-3]  # value has precedence over charge
         """
-        print("in charge with", tokens)
+        # print("in charge with", tokens)
         value, charge = tokens
         if value is None:
             value = len(charge)
         elif value and len(charge) > 1:
-            self._raise_error(None, f"Using values of {value} for {value}{charge}")
+            raise ValueError(f"Use {value}{charge[0]} instead of {value}{charge} for valence")
         valence = value if charge[0] == '+' else -value
         return valence
     def atom(self, tokens):
@@ -486,9 +493,10 @@ def byweight(self, tokens):
 
         Example: [76.95, D2O, H2O] => (D2O)3H2O
         """
+        # TODO: structure not preserved in mixtures
         total = sum(tokens[:-1:2])
         if total > 100:
-            raise ValueError(f"Total weight {total}% is more than 100%")
+            raise ValueError(f"Total weight {total}% is more than 100% in wt% mixture")
         pairs = [(compound, percent) for percent, compound in zip(tokens[:-1:2], tokens[1:-1:2])]
         pairs.append((tokens[-1], 100-total))
         # return 'byweight', [*pairs, last_pair]
@@ -511,7 +519,7 @@ def byvolume(self, tokens):
         # print("by volume", tokens)
         total = sum(tokens[:-1:2])
         if total > 100:
-            raise ValueError(f"Total volume {total}% is more than 100%")
+            raise ValueError(f"Total volume {total}% is more than 100% in vol% mixture")
         pairs = [(compound, percent) for percent, compound in zip(tokens[:-1:2], tokens[1:-1:2])]
         pairs.append((tokens[-1], 100-total))
         # print("byvolume pairs", pairs)
@@ -560,6 +568,9 @@ def layers(self, tokens):
 
         Example: [('length', 10.006, 'nm'), Ni, ('length', 3, 'mm'), Si] => NiSi164000
         """
+        # # Sanity check: make sure all units are length units. This won't happen
+        # # because the parser only accepts proper formulas.
+        # assert all(units in LENGTH_UNITS for dim, value, units in tokens[::2])
         values = [value*LENGTH_UNITS[units] for dim, value, units in tokens[::2]]
         total = sum(values)
         percent = [(m/total)*100 for m in values]
@@ -619,6 +630,61 @@ def start(self, tokens):
         formula.source = self._context
         return formula
 
+# TODO: improve error reporting for "allowed"
+def _allowed(allowed):
+    # * SPACE, SEPARATOR: Generally ignored
+    # * LPAR occurs whereever a symbol could be expected, so skip it
+    # * COLON: If asking then it probably thinks it is looking for a fasta sequence, but
+    # instead it should be looking for an element, so replace COLON with SYMBOL.
+    # * AT: Looking for @DENSITY
+    # * LPAR, RPAR: "(" and ")" are more readable
+    # * LSQB: end of element, looking for isotope, so skip
+    # * LBRACE, SUPERINT, SUPERCHARGE: end of element, looking for valence, so skip
+    skip = set("SPACE SEPARATOR LPAR LSQB LBRACE SUPERINT SUPERCHARGE".split())
+    # TODO: use order of elements in subst to sort the allowed list (currently alphabetical)
+    subst = dict(
+        NUMBER="NUMBER", # start of compound or start of mixture
+        #FASTA="[dna|rna|aa]:SEQ",
+        FASTA="aa:SEQ",
+        COLON="aa:SEQ",
+        SEQUENCE="aa:SEQ",
+        SEPARATOR="+", # generic group separator in composite
+        SPACE="SPACE",
+        SYMBOL="SYMBOL",
+        CHARGE="CHARGE[+-]",
+        LPAR='(',
+        RPAR=')',
+        LSQB='[',
+        RSQB=']',
+        LBRACE='{', # equivalent to SUPERINT and SUPERCHARGE
+        RBRACE='}',
+        VOLUMEPCT="vol%",
+        WEIGHTPCT="wt%",
+        MASS="UNIT[mg]",
+        VOLUME="UNIT[mL]",
+        LENGTH="UNIT[mm]",
+        PERCENT="%",
+        # I don't think all three of these can be concurrently allowed so no need to
+        # deduplicate. Moot since the set operation happens again after substitution below.
+        AT="@DENSITY[ni]", # only the @ is expected, but better for doc
+        DENSITY="@DENSITY[ni]", # only the number is expected, but better for doc
+        DENSITYMODE="@DENSITY[ni]", # only the [ni] is expected, but better for doc
+        MIX="//",
+        # SUBNUM SUBINT SUBFRAC covered by COUNT
+        # INTEGER and FRACTION covered by NUMBER
+        # SUPERINT SUPERCHARGE LSQB LBRACE coexist with COUNT so stripped
+        SUPERCHARGE="SUPERSCRIPT[+-]", # If you see a superscript number then you need a sign
+        )
+    stripped = set(s for s in allowed if s not in skip)
+    if not stripped:
+        stripped = allowed
+    # Perform substitution for document strings
+    stripped = set(subst.get(s, s) for s in stripped)
+    if len(stripped) > 1:
+        message = f"one of {' '.join(sorted(stripped))}"
+    else:
+        message = [*stripped][0]
+    return message
 
 def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
     """
@@ -627,12 +693,62 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
     """
     cleanup = StripJunk()
     convert = ConvertTokens(formula_str, table=table)
-    tree = parser(formula_str)
+    try:
+        tree = formula_parser.parse(formula_str)
+    except lark.exceptions.UnexpectedCharacters as exc:
+        # import pprint; pprint.pprint(exc.__dict__)
+        context = exc.get_context(formula_str).rstrip()
+        #context = exc._context.rstrip()
+        message = f"Expected {_allowed(exc.allowed)} in\n{context}"
+        raise ValueError(message)
+    except lark.exceptions.UnexpectedEOF as exc:
+        # import pprint; pprint.pprint(exc.__dict__)
+        context = exc.get_context(formula_str).rstrip()
+        message = f"Expected {_allowed(exc.expected)} in\n{context}"
+        raise ValueError(message)
+    except Exception as exc:
+        # TODO: are other exceptions possible from the Earley parser?
+        raise exc from None
     tree = cleanup.transform(tree)
-    tree = convert.transform(tree)
+    try:
+        tree = convert.transform(tree)
+    except lark.exceptions.VisitError as exc:
+        # Unwind the VisitError exception capture and reraise the original exception
+        # This requires that error messages in the transformer give enough context to
+        # correct the error.
+        raise exc.orig_exc from None
     return tree
 
+# Error conditions are marked with '!' so the exception is ignored
 examples = """
+! DNA:CAGT  # incorrect case for FASTA type not properly identified
+! dna CAGT  # missing colon in FASTA
+! O²  # SUPERCHARGE should be the only valid token here
+! ₃H2O  # badly placed subscript
+! // 3g Ca  # // is not a comment
+! 3g Ca@ // 5g Si # missing density value
+! Ca@i  # missing density value
+! Ca ⁺⁺  # extra space before valence
+! Ca++  # missing braces in valence
+! Ca{2}  # missing charge in valence
+! 37 vol% H2O@1 / 5% D2O@1  # missing /
+! 37 vol% H2O@1 /// 5% D2O@1  # extra /
+! H2O@1h  # bad density mode
+! 37 vol% H2O@1 // 5% D2O@1  # no percent in last part
+! 37 vol% H2O@1 // 5 vol% D2O@1  # only % in subsequent parts
+! 37% H2O@1 // D2O@1  # missing vol% or wt%
+! 37 val% H2O@1 // D2O@1  # bad spelling of vol%
+! Fe[56O2 # bad isotope syntax
+! Co[181]  # bad isotope
+! Ca{2+O2  # bad valence syntax
+! Co{17-}  # bad valence
+! 3..5 mg NaCl
+! 3.5 fm Si # bad units at the start; could be wt%/vol% or LENGTH, VOLUME, MASS 
+! 3.5 mm Si // 2.5 nm SiO2 //
+! 3.5 mm Si // 2.5 nm SiO2 // 35 mm cG
+! ((Co) # mismatched LPAR
+! Co)  # mismatched RPAR
+! bad:CAGT  # bad sequence type
 Co
 dna:CAGT
 (Co@5)
@@ -646,7 +762,7 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
 (Ca(CO3)((H2O)6))
 CaCO₃·6H₂O
 DHO
-!Ca{2++}  # could be interpreted as Ca{2+}
+!Ca{2++}  # bad valence string
 Ca⁺⁺  # also Ca{2+}
 O²⁻
 H[1]
@@ -661,52 +777,48 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
 78.2H2O[16] + 21.8H2O[18] @1n
 50 wt% Co // Ti
 33 wt% Co // 33% Fe // Ti
-! 93 wt% Co // 33% Fe // Ti  # More than 100%
+! 93 wt% Co // 33% Fe // Ti  # More than 100 wt%
+! 93 vol% Co // 33% Fe // Ti  # More than 100 vol%
 20 vol% (10 wt% NaCl@2.16 // H2O@1) // D2O@1n
 NaCl(H2O)29.1966(D2O)122.794@1.10i
 5g NaCl // 50mL H2O@1
 5g NaCl@2.16 // 50mL H2O@1
+! 5g NaCl // 50mL H2O   # Need density for H2O to convert volume to mass
 50 mL (45 mL H2O@1 // 5 g NaCl)@1.0707 // 20 mL D2O@1n
 1 cm Si // 5 nm Cr // 10 nm Au
 aa:RELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV
 
-# Error conditions. Mark with '!' so the exception is ignored
-! Bl2Oh
-! 5 Mg NaCl // 50mL H2O@1
-! 4 nm NaCl@2.17// 50 g Si
+! Bl2Oh   # Bad symbol
+! 5 Mg NaCl // 50mL H2O@1  # Bad units
+! 4 nm NaCl@2.17// 50 g Si  # Can't use mass in layer mixture
 
 """
 
 def check():
-    cleanup = StripJunk()
-    def filt(tree):
-        #return tree
-        tree = cleanup.transform(tree)
-        # import pprint; pprint.pprint(tree)
-        tree = convert.transform(tree)
-        return tree
-
     for line in examples.split('\n'):
         formula = line.split('#')[0]
-        bad = formula.startswith('!')
+        bad = line.startswith('!')
         if bad:
             formula = formula[1:]
         if formula:
-            print(f"*** {line}")
-            convert = ConvertTokens(text=formula)
+            if bad:
+                print(f"!!! {line[1:]}")
+            else:
+                print(f"*** {line}")
             try:
-                tree = filt(parser.parse(formula))
-                #print(f" => {tree.pretty()}")
+                tree = parse_formula(formula)
                 density = getattr(tree, 'density', None)
                 density_str = f" @ {density:.2f}" if density else ""
                 print(f" => {tree}{density_str}")
-                # TODO: structure not preserved in mixtures
-                print(f"    {getattr(tree, 'structure', None)}")
+                # print(f"    {getattr(tree, 'structure', None)}")
             except Exception as exc:
                 if bad:
-                    print(f"!!! Error: {exc}")
+                    print(f"{exc}")
                 else:
-                    raise
+                    raise exc from None
+            else:
+                if bad:
+                    raise RuntimeError(f"Exception not raised for <{formula}>")
 
 if __name__ == "__main__":
     check()
\ No newline at end of file

From c3c0ced7ea5c29ed1428a0e093db4744b64b3226 Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Thu, 5 Mar 2026 22:22:31 -0500
Subject: [PATCH 03/19] better formatting of invalid valence error

---
 periodictable/core.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/periodictable/core.py b/periodictable/core.py
index 534a258..9c8e496 100644
--- a/periodictable/core.py
+++ b/periodictable/core.py
@@ -398,9 +398,9 @@ def __init__(self, element_or_isotope: Element|Isotope):
     def __getitem__(self, charge: int) -> Ion:
         if charge not in self.ionset:
             if charge not in self.element_or_isotope.ions:
-                raise ValueError("%(charge)d is not a valid charge for %(symbol)s"
-                                 % dict(charge=charge,
-                                        symbol=self.element_or_isotope.symbol))
+                valence = f"{abs(charge)}{'+' if charge > 0 else '-'}"
+                symbol = self.element_or_isotope.symbol
+                raise ValueError(f"valence {valence} is not valid for {symbol}")
             self.ionset[charge] = Ion(self.element_or_isotope, charge)
         return self.ionset[charge]
 

From a31ce4649381197995d6df382dbabf0c11add9c1 Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Thu, 5 Mar 2026 22:23:19 -0500
Subject: [PATCH 04/19] Allow Ang when defining layered samples

---
 periodictable/formulas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/periodictable/formulas.py b/periodictable/formulas.py
index f731c93..adfe916 100644
--- a/periodictable/formulas.py
+++ b/periodictable/formulas.py
@@ -712,7 +712,7 @@ def _isotope_substitution(compound: "Formula", source: Atom, target: Atom, porti
 
 # TODO: Grammar should be independent of table
 # TODO: Parser can't handle meters as 'm' because it conflicts with the milli prefix
-LENGTH_UNITS = {'nm': 1e-9, 'um': 1e-6, 'μm': 1e-6, 'mm': 1e-3, 'cm': 1e-2}
+LENGTH_UNITS = {'nm': 1e-9, 'um': 1e-6, 'μm': 1e-6, 'mm': 1e-3, 'cm': 1e-2, 'Ang': 1e-10, 'Å': 1e-10}
 MASS_UNITS = {'ng': 1e-9, 'ug': 1e-6, 'mg': 1e-3, 'g': 1e+0, 'kg': 1e+3}
 VOLUME_UNITS = {'nL': 1e-9, 'uL': 1e-6, 'mL': 1e-3, 'L': 1e+0}
 LENGTH_RE = '('+'|'.join(LENGTH_UNITS.keys())+')'

From 13901325e8cef108ea189a23648ed6eb2a827304 Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Thu, 5 Mar 2026 23:50:38 -0500
Subject: [PATCH 05/19] check failure text in pyparsing

---
 explore/lark_parse.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/explore/lark_parse.py b/explore/lark_parse.py
index d773dcc..cb226bd 100644
--- a/explore/lark_parse.py
+++ b/explore/lark_parse.py
@@ -720,6 +720,7 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
     return tree
 
 # Error conditions are marked with '!' so the exception is ignored
+# Lines marked ## fail on the existing parser
 examples = """
 ! DNA:CAGT  # incorrect case for FASTA type not properly identified
 ! dna CAGT  # missing colon in FASTA
@@ -727,7 +728,7 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
 ! ₃H2O  # badly placed subscript
 ! // 3g Ca  # // is not a comment
 ! 3g Ca@ // 5g Si # missing density value
-! Ca@i  # missing density value
+! Ca@i  # missing density value  ##
 ! Ca ⁺⁺  # extra space before valence
 ! Ca++  # missing braces in valence
 ! Ca{2}  # missing charge in valence
@@ -751,8 +752,8 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
 ! bad:CAGT  # bad sequence type
 Co
 dna:CAGT
-(Co@5)
-(((Co@5)@6))
+(Co@5) ##
+(((Co@5)@6)) ##
 CaCO3
 CaCO₃
 CaCO3+6H2O
@@ -760,15 +761,15 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
 CaCO3(H2O)6
 CaCO3 (H2O)6
 (Ca(CO3)((H2O)6))
-CaCO₃·6H₂O
+CaCO₃·6H₂O  ##
 DHO
 !Ca{2++}  # bad valence string
-Ca⁺⁺  # also Ca{2+}
-O²⁻
+Ca⁺⁺  # also Ca{2+}  ##
+O²⁻   ##
 H[1]
 H2O@1
 D2O@1n
-D2O @ 1.11
+D2O @ 1.11  ##
 D2O@1.11i
 HO{1-}
 H[1]{1-}O
@@ -806,7 +807,9 @@ def check():
             else:
                 print(f"*** {line}")
             try:
+                # Toggle the following to test pyparsing vs lark
                 tree = parse_formula(formula)
+                #tree = pt.formula(formula) if "##" not in line else "!!! pyparsing fails"
                 density = getattr(tree, 'density', None)
                 density_str = f" @ {density:.2f}" if density else ""
                 print(f" => {tree}{density_str}")
@@ -817,6 +820,7 @@ def check():
                 else:
                     raise exc from None
             else:
+                if '##' in line: continue  # pyparsing should fail but doesn't
                 if bad:
                     raise RuntimeError(f"Exception not raised for <{formula}>")
 

From ee2fc986e6e07a0fb613c997571746b2cd51d4ce Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Fri, 6 Mar 2026 14:19:00 -0500
Subject: [PATCH 06/19] display and parse unicode superscripts for isotopes

---
 explore/lark_parse.py     | 112 +++++++++++++++++++------------
 periodictable/formulas.py | 134 ++++++++++++++++++++++++++------------
 2 files changed, 162 insertions(+), 84 deletions(-)

diff --git a/explore/lark_parse.py b/explore/lark_parse.py
index cb226bd..40ae090 100644
--- a/explore/lark_parse.py
+++ b/explore/lark_parse.py
@@ -2,8 +2,15 @@
 import periodictable as pt
 from periodictable.core import PeriodicTable
 from periodictable.core import default_table
-from periodictable.formulas import from_subscript, Formula, _mix_by_weight_pairs, _mix_by_volume_pairs
-from periodictable.formulas import VOLUME_UNITS, MASS_UNITS, LENGTH_UNITS
+from periodictable.formulas import (
+    from_subscript, from_superscript,
+    Formula,
+    _mix_by_weight_pairs, _mix_by_volume_pairs,
+    VOLUME_UNITS, MASS_UNITS, LENGTH_UNITS,
+    pretty as pretty_formula
+)
+
+# TODO: valence belongs to a group rather than element
 
 grammar = """
 start      : SPACE? formula SPACE? # strip blank space from start and end
@@ -40,17 +47,18 @@
 #FASTA     : /dna|rna|aa/
 SEQUENCE   : /[A-Z -*]+/
 composite  : [NUMBER] group (SEPARATOR [NUMBER] group)*
-group      : ((atom | "(" formula ")") [COUNT])+
-atom       : SYMBOL [isotope] [charge]
+group      : ((atom | isoatom | "(" formula ")") [COUNT])+
+atom       : SYMBOL [isotope] [valence]
+isoatom    : SUPERINT SYMBOL [valence]
 # could list all elements, but better error reporting if element symbol lookup fails
 SYMBOL     : /[A-Z][a-z]*/
 isotope    : "[" INTEGER "]"
-charge     : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE
+valence    : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE
 density    : SPACE? "@" SPACE? DENSITY [DENSITYMODE]
 DENSITY    : NUMBER  # using alias DENSITY for number for better error reporting
 
 # Tokens
-CHARGE     : /[+]+|[-]+/  # allow charge using {++} or {--}
+CHARGE     : /[+]+|[-]+/  # allow valence using {++} or {--}
 SUPERINT   : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/
 SUPERCHARGE: /\u207A+|\u207B+/  # Allow Ca++ and Cl- using superscript + and -
 DENSITYMODE: /[ni]/
@@ -76,21 +84,6 @@
 # propagate_positions saves start_pos and end_pos for each rule as well as each terminal.
 formula_parser = lark.Lark(grammar, propagate_positions=True)
 
-def from_superscript(value: str) -> str:
-    """
-    Convert unicode superscript characters to normal characters. This allows us to parse,
-    for example, Ca²⁺ as Ca{2+}.
-    """
-    codepoints = {
-        '\u2070': '0', '\u00B9': '1', '\u00B2': '2', '\u00B3': '3',
-        '\u2074': '4', '\u2075': '5', '\u2076': '6', '\u2077': '7',
-        '\u2078': '8', '\u2079': '9', '\u207a': '+', '\u207b': '-',
-        '\u207c': '=', '\u207d': '(', '\u207e': ')',
-
-        '\u2071': 'i', '\u207f': 'n',
-    }
-    return ''.join(codepoints.get(char, char) for char in str(value))
-
 def int_or_float(s):
     f = float(s)
     i = int(f)
@@ -184,7 +177,7 @@ def SUPERINT(self, token):
         """
         Return the integer value of a sequence of superscript digits.
 
-        This is used in the charge rule as part of the valence specification for the atom.
+        This is used to specify the valence or to specify the isotope.
         """
         return int(from_superscript(token.value))
     def DENSITYMODE(self, token):
@@ -198,14 +191,14 @@ def CHARGE(self, token):
         Return a sequence of plus and minus characters. By grammar rules they must all have
         the same sign.
 
-        This is used in the charge rule as part of the valence specification for the atom.
+        This is used in the valence rule to specify the charge for the atom.
         """
         return token.value
     def SUPERCHARGE(self, token):
         """
         Convert sequence of superscript plus and minus characters to ASCII plus and minus.
 
-        This is used in the charge rule as part of the valence specification for the atom.
+        This is used in the valence rule to specify the charge for the atom.
         """
         return from_superscript(token.value)
     def SYMBOL(self, token):
@@ -243,7 +236,7 @@ def isotope(self, tokens):
         Transform: [isotope] => isotope
         """
         return tokens[0]
-    def charge(self, tokens):
+    def valence(self, tokens):
         """
         Return valence from number and sign.
 
@@ -261,10 +254,12 @@ def charge(self, tokens):
         Transform: [number|None, 'charge'] => valence
 
         Example: ['{1+}'] => [1, '+'] = Ca.ion[1]
-        # Ca{++} => [None, '++'] = Ca.ion[2]
-        # Ca{3--} => [3, '--'] = Ca.ion[-3]  # value has precedence over charge
+
+        Example: Ca{++} => [None, '++'] = Ca.ion[2]
+
+        Example: Ca{3--} => ValueError
         """
-        # print("in charge with", tokens)
+        # print("in valence with", tokens)
         value, charge = tokens
         if value is None:
             value = len(charge)
@@ -280,15 +275,10 @@ def atom(self, tokens):
         provided to the ConvertTokens constructor then that will be used to retrieve the element
         from the symbol.
 
-        Isotope and charge are optional. By using the rule "SYMBOL [isotope] [charge|supercharge]"
-        with "[opt]" for the optional components rather "opt?", the missing components appear
-        as None in the list of tokens. The "supercharge" option allows unicode superscripts to
-        be used to specify charge rather than curly braces "{charge}".
-
         Raises an error if the symbol does not exist, does not have that isotope or doesn't
-        allow that charge.
+        allow that valence.
 
-        Transform: ['symbol', isotope|None, charge|None] => atom
+        Transform: ['symbol', isotope|None, valence|None] => atom
 
         Example: ['H', 1, 1] => H[1]{+}
 
@@ -307,6 +297,28 @@ def atom(self, tokens):
         #print(f"atom {tokens} => {atom}")
         return atom
 
+    def isoatom(self, tokens):
+        """
+        Returns an isotope from the periodic table.
+
+        Usually this will use elements from the default table, but if an alternate table is
+        provided to the ConvertTokens constructor then that will be used to retrieve the element
+        from the symbol.
+
+        Raises an error if the symbol does not exist, does not have that isotope or doesn't
+        allow that valence.
+
+        Transform: [isotope, 'symbol', valence|None] => atom
+
+        Example ²H⁺: [2, 'H', 1] => D{+}
+        """
+        # print("isoatom", tokens)
+        iso, el, ion = tokens
+        atom = el[iso].ion[ion] if ion else el[iso]
+        # print(f"isoatom {tokens} => {atom}")
+        return atom
+
+
     def group(self, tokens):
         """
         Returns a sequence of (count, item) pairs, where item is an atom or a nested formula.
@@ -630,7 +642,7 @@ def start(self, tokens):
         formula.source = self._context
         return formula
 
-# TODO: improve error reporting for "allowed"
+# TODO: if the next character is ":" then report error as bad fasta sequence type
 def _allowed(allowed):
     # * SPACE, SEPARATOR: Generally ignored
     # * LPAR occurs whereever a symbol could be expected, so skip it
@@ -646,7 +658,8 @@ def _allowed(allowed):
         NUMBER="NUMBER", # start of compound or start of mixture
         #FASTA="[dna|rna|aa]:SEQ",
         FASTA="aa:SEQ",
-        COLON="aa:SEQ",
+        COLON=":",
+        #COLON="aa:SEQ",
         SEQUENCE="aa:SEQ",
         SEPARATOR="+", # generic group separator in composite
         SPACE="SPACE",
@@ -682,8 +695,12 @@ def _allowed(allowed):
     stripped = set(subst.get(s, s) for s in stripped)
     if len(stripped) > 1:
         message = f"one of {' '.join(sorted(stripped))}"
-    else:
+    elif stripped:
         message = [*stripped][0]
+    else:
+        # This occurs when the middle part of a percent mixture has no percentage.
+        # We could look for '//' in the string to report a better error message.
+        message = "end of formula"
     return message
 
 def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
@@ -696,7 +713,7 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
     try:
         tree = formula_parser.parse(formula_str)
     except lark.exceptions.UnexpectedCharacters as exc:
-        # import pprint; pprint.pprint(exc.__dict__)
+        #import pprint; pprint.pprint(exc.__dict__)
         context = exc.get_context(formula_str).rstrip()
         #context = exc._context.rstrip()
         message = f"Expected {_allowed(exc.allowed)} in\n{context}"
@@ -730,12 +747,14 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
 ! 3g Ca@ // 5g Si # missing density value
 ! Ca@i  # missing density value  ##
 ! Ca ⁺⁺  # extra space before valence
-! Ca++  # missing braces in valence
+! Ca++  # missing braces in valence: the + is acting as SEPARATOR
+! Ca2+  # missing braces in valence: the 2 is acting as COUNT and the + as SEPARATOR
 ! Ca{2}  # missing charge in valence
 ! 37 vol% H2O@1 / 5% D2O@1  # missing /
 ! 37 vol% H2O@1 /// 5% D2O@1  # extra /
 ! H2O@1h  # bad density mode
-! 37 vol% H2O@1 // 5% D2O@1  # no percent in last part
+! 37 vol% NaCl@2.16 // H2O@1 // D2O@1  # percent missing in middle part
+! 37 vol% H2O@1 // 5% D2O@1  # percent not allowed in last part
 ! 37 vol% H2O@1 // 5 vol% D2O@1  # only % in subsequent parts
 ! 37% H2O@1 // D2O@1  # missing vol% or wt%
 ! 37 val% H2O@1 // D2O@1  # bad spelling of vol%
@@ -767,6 +786,10 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
 Ca⁺⁺  # also Ca{2+}  ##
 O²⁻   ##
 H[1]
+²H⁺    # D{+} ##
+O²H⁻   # OD{-} ##
+O²⁻H⁺  # O{2-}H{+} ##
+O²⁻²H⁺ # O{2-}D{+} ##
 H2O@1
 D2O@1n
 D2O @ 1.11  ##
@@ -775,7 +798,8 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
 H[1]{1-}O
 H2SO4
 C3H4H[1]NO@1.29n
-78.2H2O[16] + 21.8H2O[18] @1n
+78.2H2O[16] + 21.8H2O[18] @1n  # density applies to composite
+dna:CAGT @1n  # fasta density override
 50 wt% Co // Ti
 33 wt% Co // 33% Fe // Ti
 ! 93 wt% Co // 33% Fe // Ti  # More than 100 wt%
@@ -785,6 +809,7 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
 5g NaCl // 50mL H2O@1
 5g NaCl@2.16 // 50mL H2O@1
 ! 5g NaCl // 50mL H2O   # Need density for H2O to convert volume to mass
+(10 wt% NaCl // H2O)@1.07n # set density of a mixture
 50 mL (45 mL H2O@1 // 5 g NaCl)@1.0707 // 20 mL D2O@1n
 1 cm Si // 5 nm Cr // 10 nm Au
 aa:RELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV
@@ -812,7 +837,8 @@ def check():
                 #tree = pt.formula(formula) if "##" not in line else "!!! pyparsing fails"
                 density = getattr(tree, 'density', None)
                 density_str = f" @ {density:.2f}" if density else ""
-                print(f" => {tree}{density_str}")
+                mode = 'unicode' # unicode latex html plain
+                print(f" => {pretty_formula(tree, mode)}{density_str}")
                 # print(f"    {getattr(tree, 'structure', None)}")
             except Exception as exc:
                 if bad:
diff --git a/periodictable/formulas.py b/periodictable/formulas.py
index adfe916..9e697ff 100644
--- a/periodictable/formulas.py
+++ b/periodictable/formulas.py
@@ -1073,19 +1073,6 @@ def _convert_to_hill_notation(atoms: dict[Atom, float]) -> Structure:
     """
     return tuple((atoms[el], el) for el in sorted(atoms.keys(), key=_hill_key))
 
-def _str_one_atom(fragment: Atom) -> str:
-    # Normal isotope string form is #-Yy, but we want Yy[#]
-    if isisotope(fragment) and 'symbol' not in fragment.__dict__:
-        ret = "%s[%d]"%(fragment.symbol, cast(Isotope, fragment).isotope)
-    else:
-        ret = fragment.symbol
-    if fragment.charge != 0:
-        sign = '+' if fragment.charge > 0 else '-'
-        value = str(abs(fragment.charge)) if abs(fragment.charge) > 1 else ''
-        ret += '{'+value+sign+'}'
-    return ret
-
-# TODO: add typing to _str_atoms
 def _str_atoms(seq) -> str:
     """
     Convert formula structure to string.
@@ -1094,7 +1081,7 @@ def _str_atoms(seq) -> str:
     ret = ""
     for count, fragment in seq:
         if isatom(fragment):
-            ret += _str_one_atom(fragment)
+            ret += str(fragment)
             if count != 1:
                 ret += "%g"%count
         else:
@@ -1113,7 +1100,7 @@ def from_subscript(value: str) -> str:
     Convert unicode subscript characters to normal characters. This allows us to parse,
     for example, H₂O as H2O.
     """
-    subscript_codepoints = {
+    codepoints = {
         '\u2080': '0', '\u2081': '1', '\u2082': '2', '\u2083': '3',
         '\u2084': '4', '\u2085': '5', '\u2086': '6', '\u2087': '7',
         '\u2088': '8', '\u2089': '9', '\u208a': '+', '\u208b': '-',
@@ -1124,11 +1111,26 @@ def from_subscript(value: str) -> str:
         '\u2098': 'm', '\u2099': 'n', '\u209a': 'p', '\u209b': 's',
         '\u209c': 't',
     }
-    return ''.join(subscript_codepoints.get(char, char) for char in str(value))
+    return ''.join(codepoints.get(char, char) for char in str(value))
+
+def from_superscript(value: str) -> str:
+    """
+    Convert unicode superscript characters to normal characters. This allows us to parse,
+    for example, Ca²⁺ as Ca{2+}.
+    """
+    codepoints = {
+        '\u2070': '0', '\u00B9': '1', '\u00B2': '2', '\u00B3': '3',
+        '\u2074': '4', '\u2075': '5', '\u2076': '6', '\u2077': '7',
+        '\u2078': '8', '\u2079': '9', '\u207a': '+', '\u207b': '-',
+        '\u207c': '=', '\u207d': '(', '\u207e': ')',
+
+        '\u2071': 'i', '\u207f': 'n',
+    }
+    return ''.join(codepoints.get(char, char) for char in str(value))
 
 def unicode_subscript(value: str) -> str:
     # Unicode subscript codepoints. Note that decimal point looks okay as subscript
-    subscript_codepoints = {
+    codepoints = {
         '0': '\u2080', '1': '\u2081', '2': '\u2082', '3': '\u2083',
         '4': '\u2084', '5': '\u2085', '6': '\u2086', '7': '\u2087',
         '8': '\u2088', '9': '\u2089', '+': '\u208a', '-': '\u208b',
@@ -1142,11 +1144,11 @@ def unicode_subscript(value: str) -> str:
         '\u2013': '\u208b', # en-dash is same as dash
         '\u2014': '\u208b', # em-dash is same as dash
     }
-    return ''.join(subscript_codepoints.get(char, char) for char in str(value))
+    return ''.join(codepoints.get(char, char) for char in str(value))
 
 def unicode_superscript(value: str) -> str:
     # Unicode subscript codepoints. Note that decimal point looks okay as subscript
-    superscript_codepoints = {
+    codepoints = {
         #'.': '\u00B0',  # degree symbol looks too much like zero
         #'.': ' \u02D9',  # dot above modifier looks okay in a floating string, but risky
         #'.': ' \u0307',  # space with dot above?
@@ -1162,7 +1164,7 @@ def unicode_superscript(value: str) -> str:
         '\u2013': '\u207b', # en-dash is same as dash
         '\u2014': '\u207b', # em-dash is same as dash
     }
-    return ''.join(superscript_codepoints.get(char, char) for char in str(value))
+    return ''.join(codepoints.get(char, char) for char in str(value))
 
 SUBSCRIPT: dict[str, Callable[[str], str]] = {
     # The latex renderer should work for github style markdown
@@ -1171,32 +1173,82 @@ def unicode_superscript(value: str) -> str:
     'unicode': unicode_subscript,
     'plain': lambda text: text
 }
-def pretty(compound: Formula, mode: str='unicode') -> str:
+SUPERSCRIPT: dict[str, Callable[[str], str]] = {
+    # The latex renderer should work for github style markdown
+    'latex': lambda text: f'$^{{{text}}}$',
+    'html': lambda text: f'<sup>{text}</sup>',
+    'unicode': unicode_superscript,
+    'plain': lambda text: text,
+}
+
+class PrettyFormula:
     """
-    Convert the formula to a string. The *mode* can be 'unicode', 'html' or
-    'latex' depending on how subscripts should be rendered. If *mode* is 'plain'
-    then don't use subscripts for the element quantities.
+    Formula pretty-printer.
 
-    Use *pretty(compound.hill)* for a more compact representation.
+    Formats formulas for output, using superscripts for isotope and valence and
+    subscripts for element counts.
+
+    *mode* is unicode, latex, html or plain for no special formatting.
     """
-    return _pretty(compound.structure, SUBSCRIPT[mode])
-
-# TODO: type hinting for _pretty
-def _pretty(structure, subscript: Callable[[str], str]) -> str:
-    # TODO: if superscript is not None then render O[16] as {}^{16}O
-    parts = []
-    for count, part in structure:
-        if isinstance(part, tuple):
-            if count == 1:
-                parts.append(_pretty(part, subscript))
-            else:
-                parts.append(f'({_pretty(part, subscript)}){subscript(count)}')
-        elif count == 1:
-            parts.append(f'{_str_one_atom(part)}')
+    mode: str
+    superscript: Callable[[str], str]
+    subscript: Callable[[str], str]
+
+    def __init__(self, mode):
+        self.mode = mode
+        self.subscript = SUBSCRIPT[mode]
+        self.superscript = SUPERSCRIPT[mode]
+
+    def walk_atom(self, atom):
+        if self.mode == 'plain':
+            return str(atom)
+        if ision(atom):
+            charge = '-' if atom.charge < 0 else '+'
+            magnitude = abs(atom.charge)
+            valence = charge*magnitude if magnitude < 2 else f"{magnitude}{charge}"
+            valence = self.superscript(valence)
+            atom = atom.element
+        else:
+            valence = ""
+        if isisotope(atom) and atom.symbol == atom.element.symbol:
+            isotope = self.superscript(str(atom.isotope))
         else:
-            parts.append(f'{_str_one_atom(part)}{subscript(count)}')
-    return ''.join(parts)
+            isotope = ""
+        return f"{isotope}{atom.symbol}{valence}"
+
+    def format(self, compound: Formula):
+        if self.mode == 'plain':
+            return str(compound)
+        return self.walk(compound.structure)
+
+    def walk(self, structure):
+        parts = []
+        for count, part in structure:
+            if isinstance(part, tuple):
+                if count == 1:
+                    parts.append(self.walk(part))
+                else:
+                    parts.append(f'({self.walk(part)}){self.subscript(count)}')
+            elif count == 1:
+                parts.append(self.walk_atom(part))
+            else:
+                parts.append(f'{self.walk_atom(part)}{self.subscript(count)}')
+        return ''.join(parts)
+
 
+def pretty(compound: Formula, mode: str='unicode') -> str:
+    """
+    Convert the formula to a string.
+
+    *mode* is unicode, html, latex, plain [default = unicode]
+
+    If *mode* is 'plain' then don't use superscripts and subscripts for rendering.
+
+    Use *pretty(compound.hill)* for a more compact representation.
+    """
+    if mode is None:
+        mode = 'unicode'
+    return PrettyFormula(mode).format(compound)
 
 def demo():
     import sys

From a30069e5b214746fa63398a2789e441c2251f551 Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Fri, 6 Mar 2026 15:06:11 -0500
Subject: [PATCH 07/19] fix tests

---
 explore/lark_parse.py     |  1 +
 periodictable/core.py     |  2 +-
 periodictable/formulas.py | 41 ++++++++++++++++++++++++++-------------
 test/test_core.py         |  2 +-
 4 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/explore/lark_parse.py b/explore/lark_parse.py
index 40ae090..24c38e3 100644
--- a/explore/lark_parse.py
+++ b/explore/lark_parse.py
@@ -838,6 +838,7 @@ def check():
                 density = getattr(tree, 'density', None)
                 density_str = f" @ {density:.2f}" if density else ""
                 mode = 'unicode' # unicode latex html plain
+                # mode = 'plain'
                 print(f" => {pretty_formula(tree, mode)}{density_str}")
                 # print(f"    {getattr(tree, 'structure', None)}")
             except Exception as exc:
diff --git a/periodictable/core.py b/periodictable/core.py
index 9c8e496..fb095b3 100644
--- a/periodictable/core.py
+++ b/periodictable/core.py
@@ -398,7 +398,7 @@ def __init__(self, element_or_isotope: Element|Isotope):
     def __getitem__(self, charge: int) -> Ion:
         if charge not in self.ionset:
             if charge not in self.element_or_isotope.ions:
-                valence = f"{abs(charge)}{'+' if charge > 0 else '-'}"
+                valence = f"{abs(charge)}{'-' if charge < 0 else '+'}"
                 symbol = self.element_or_isotope.symbol
                 raise ValueError(f"valence {valence} is not valid for {symbol}")
             self.ionset[charge] = Ion(self.element_or_isotope, charge)
diff --git a/periodictable/formulas.py b/periodictable/formulas.py
index 9e697ff..2b118d7 100644
--- a/periodictable/formulas.py
+++ b/periodictable/formulas.py
@@ -675,7 +675,7 @@ def __rmul__(self, other):
         return ret
 
     def __str__(self):
-        return self.name if self.name else _str_atoms(self.structure)
+        return self.name if self.name else "".join(_str_atoms(self.structure))
 
     def __repr__(self):
         return "formula('%s')"%(str(self))
@@ -1073,24 +1073,41 @@ def _convert_to_hill_notation(atoms: dict[Atom, float]) -> Structure:
     """
     return tuple((atoms[el], el) for el in sorted(atoms.keys(), key=_hill_key))
 
-def _str_atoms(seq) -> str:
+def _str_one_atom(atom: Atom) -> str:
+    """
+    Format a single atom as SYMBOL[ISOTOPE]{VALENCE}.
+
+    Can't use str(atom) => ISOTOPE-SYMBOL{VALENCE} or repr(atom) => SYMBOL[ISOTOPE].ion[VALENCE]
+    """
+    valence = isotope = ""
+    if ision(atom):
+        ion = cast(Ion, atom)
+        charge = '-' if ion.charge < 0 else '+'
+        magnitude = abs(ion.charge)
+        valence = charge*magnitude if magnitude < 2 else f"{magnitude}{charge}"
+        valence = "{%s}"%valence
+        atom = ion.element
+    if isisotope(atom):
+        iso = cast(Isotope, atom)
+        if iso.symbol == iso.element.symbol:
+            isotope = f"[{iso.isotope}]"
+    return f"{atom.symbol}{isotope}{valence}"
+
+def _str_atoms(seq) -> list[str]:
     """
     Convert formula structure to string.
     """
     #print "str", seq
-    ret = ""
+    ret = []
     for count, fragment in seq:
         if isatom(fragment):
-            ret += str(fragment)
+            ret.append(_str_one_atom(fragment))
             if count != 1:
-                ret += "%g"%count
+                ret.append(f"{count:g}")
+        elif count == 1:
+            ret.extend(_str_atoms(fragment))
         else:
-            if count == 1:
-                piece = _str_atoms(fragment)
-            else:
-                piece = "(%s)%g"%(_str_atoms(fragment), count)
-            #ret = ret+" "+piece if ret else piece
-            ret += piece
+            ret.extend(("(", *_str_atoms(fragment), ")", f"{count:g}"))
 
     return ret
 
@@ -1200,8 +1217,6 @@ def __init__(self, mode):
         self.superscript = SUPERSCRIPT[mode]
 
     def walk_atom(self, atom):
-        if self.mode == 'plain':
-            return str(atom)
         if ision(atom):
             charge = '-' if atom.charge < 0 else '+'
             magnitude = abs(atom.charge)
diff --git a/test/test_core.py b/test/test_core.py
index d010a76..c86b9a0 100644
--- a/test/test_core.py
+++ b/test/test_core.py
@@ -64,7 +64,7 @@ def test():
         Fe.ion[-3]
         raise Exception("accepts invalid ions")
     except ValueError as msg:
-        assert str(msg) == "-3 is not a valid charge for Fe"
+        assert str(msg) == "valence 3- is not valid for Fe"
 
     assert data_files()[0][0] == "periodictable-data/xsf"
 

From f8cecea9072a6b060763a4621f962968c16c7b58 Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Wed, 20 May 2026 20:01:38 -0400
Subject: [PATCH 08/19] add output type hints to ast transforms

---
 explore/lark_parse.py  | 135 ++++++++++++++++++++++-------------------
 periodictable/fasta.py |   8 +--
 2 files changed, 78 insertions(+), 65 deletions(-)

diff --git a/explore/lark_parse.py b/explore/lark_parse.py
index 24c38e3..ac412b2 100644
--- a/explore/lark_parse.py
+++ b/explore/lark_parse.py
@@ -1,10 +1,9 @@
 import lark
-import periodictable as pt
-from periodictable.core import PeriodicTable
+from periodictable.core import PeriodicTable, Element, Atom, Isotope
 from periodictable.core import default_table
 from periodictable.formulas import (
     from_subscript, from_superscript,
-    Formula,
+    Formula, Structure,
     _mix_by_weight_pairs, _mix_by_volume_pairs,
     VOLUME_UNITS, MASS_UNITS, LENGTH_UNITS,
     pretty as pretty_formula
@@ -89,13 +88,16 @@ def int_or_float(s):
     i = int(f)
     return i if i == f else f
 
-class StripJunk(lark.Transformer):
+class StripPunctuation(lark.Transformer):
     """
     Token stripper visitor class.
 
     This is done separately from the formula composer so that we can show the cleaned tree
     before debugging the conversion.
 
+    Unnamed punctuation characters []{}():% and units (kg, mL, nm, ...) which are represented
+    as quoted strings in the grammar have no associated token.
+
     Note: could get the same effect by renaming the unused terminals with leading underscore,
     but that makes the grammar harder to read.
     """
@@ -173,20 +175,20 @@ def COUNT(self, token: lark.Token) -> int|float:
         return it as an integer.
         """
         return int_or_float(from_subscript(token.value))
-    def SUPERINT(self, token):
+    def SUPERINT(self, token) -> int:
         """
         Return the integer value of a sequence of superscript digits.
 
         This is used to specify the valence or to specify the isotope.
         """
         return int(from_superscript(token.value))
-    def DENSITYMODE(self, token):
+    def DENSITYMODE(self, token) -> str:
         """
         Return the value of the DENSITYMODE token, either "n" or "i". If no mode is specified
         then a token value of None will be given to the density rule.
         """
         return token.value
-    def CHARGE(self, token):
+    def CHARGE(self, token) -> int:
         """
         Return a sequence of plus and minus characters. By grammar rules they must all have
         the same sign.
@@ -194,14 +196,14 @@ def CHARGE(self, token):
         This is used in the valence rule to specify the charge for the atom.
         """
         return token.value
-    def SUPERCHARGE(self, token):
+    def SUPERCHARGE(self, token) -> int:
         """
         Convert sequence of superscript plus and minus characters to ASCII plus and minus.
 
         This is used in the valence rule to specify the charge for the atom.
         """
         return from_superscript(token.value)
-    def SYMBOL(self, token):
+    def SYMBOL(self, token) -> Element:
         """
         Look up the element in the periodic table and return it.
 
@@ -211,32 +213,24 @@ def SYMBOL(self, token):
             return self._table.symbol(token.value)
         except Exception:
             raise ValueError(f"Element {token.value} doesn't exist")
-    def FASTA(self, token):
+    def FASTA(self, token) -> str:
         """
         Return the token value as the fasta sequence type: "dna", "rna" or "aa".
         """
         return token.value
-    def SEQUENCE(self, token):
+    def SEQUENCE(self, token) -> str:
         """
         Return the token value as the fasta sequence string.
         """
         return token.value
-    def fasta(self, tokens):
-        """
-        Return a fasta sequence and its type.
-
-        Transform: [type, sequence] => ('fasta', type, sequence)
-        """
-        stype, sequence = tokens
-        return 'fasta', stype, sequence
-    def isotope(self, tokens):
+    def isotope(self, tokens) -> int:
         """
         Return the isotope number for the atom.
 
         Transform: [isotope] => isotope
         """
         return tokens[0]
-    def valence(self, tokens):
+    def valence(self, tokens) -> int:
         """
         Return valence from number and sign.
 
@@ -267,7 +261,7 @@ def valence(self, tokens):
             raise ValueError(f"Use {value}{charge[0]} instead of {value}{charge} for valence")
         valence = value if charge[0] == '+' else -value
         return valence
-    def atom(self, tokens):
+    def atom(self, tokens) -> Atom:
         """
         Returns an atom from the periodic table.
 
@@ -297,7 +291,7 @@ def atom(self, tokens):
         #print(f"atom {tokens} => {atom}")
         return atom
 
-    def isoatom(self, tokens):
+    def isoatom(self, tokens) -> Atom:
         """
         Returns an isotope from the periodic table.
 
@@ -319,37 +313,46 @@ def isoatom(self, tokens):
         return atom
 
 
-    def group(self, tokens):
+    def group(self, tokens) -> Structure:
         """
         Returns a sequence of (count, item) pairs, where item is an atom or a nested formula.
         Missing counts default to 1.
 
         Transform: [atom|formula, count|None, ...] => ((count, atom|formula), ...)
+
+        Example CaCO3: [Ca, None, C, None, O, 3]
+        => ((1, Ca), (1, C), (3, O))
         """
+        # print("group tokens", tokens)
         tokens = [1 if value is None else value for value in tokens]
         pairs = tuple((count, item) for item, count in zip(tokens[::2], tokens[1::2]))
+        # print("group output", pairs)
         return pairs
 
-    def composite(self, tokens):
+    def composite(self, tokens) -> Structure:
         """
         Returns a sequence of (number, group) pairs. Each group is a sequence of (count, item)
         pairs, where item is an atom or a nested formula. Missing numbers default to 1.
 
         Transform: [number|None, group, ...] => ((number, group), ...) | ((count, atom), ...)
 
-        Example CaCO3 6H2O: None, ((1, Ca), (1, C), (3, O)), 6, ((2, H), (1, O))]
+        Example CaCO3 6H2O: [None, ((1, Ca), (1, C), (3, O)), 6, ((2, H), (1, O))]
         => ((1, ((1, Ca), (1, C), (3, O))), (6, ((2, H), (1, O))))
 
-        Example CaCO3(H20)6: [[None, ((1, Ca), (1, C), (3, O), (6, formula('H2O')))]
-        => ((1, Ca), (1, C), (3, O), (6, formula('H2O')))
+        Example CaCO3(H2O)6: [None, ((1, Ca), (1, C), (3, O), (6, formula('H2O')))]
+        => ((1, ((1, Ca), (1, C), (3, O), (6, formula('H2O')))),)
+
+        Example CaCO3 (H2O)6: [None, ((1, Ca), (1, C), (3, O)), None, ((6, formula('H2O')),)]
+        => ((1, ((1, Ca), (1, C), (3, O))), (1, ((6, formula('H2O')),)))
         """
-        # print("in composite", tokens)
+        # print("composite tokens", tokens)
         numbers = [1 if v is None else v for v in tokens[::2]]
         groups = tokens[1::2]
         pairs = tuple((number, group) for number, group in zip(numbers, groups))
+        # print("composite output", pairs)
         return pairs
 
-    def fasta(self, tokens):
+    def fasta(self, tokens) -> Structure:
         """
         Returns the formula corresponding to the FASTA sequence, with the natural
         density set. Labile hydrogen use H[1] in the formula.
@@ -359,23 +362,25 @@ def fasta(self, tokens):
 
         Transform: [ /aa|dna|rna/, /[A-Z -*]+/ ] => (1, ((1, formula),))
 
-        Example dna:CAGT: ['dna', 'CAGT'] => (1, ((1, C39H37H[1]10N15O25P4@1.69),))
+        Example dna:CAGT: ['dna', 'CAGT']
+        => ((1, ((1, formula('C39H37H[1]10N15O25P4')),)),)
         """
         # TODO: fasta is ignoring table when parsing
         # TODO: avoid circular imports
         # TODO: support other biochemicals (carbohydrate residues, lipids)
-        from periodictable import fasta
+        from periodictable.fasta import CODE_TABLES, Sequence
 
-        # print("in fasta", tokens)
+        # print("fasta input", tokens)
         seq_type, seq = tokens
-        if seq_type not in fasta.CODE_TABLES:
+        if seq_type not in CODE_TABLES:
             raise ValueError(f"Invalid fasta sequence type '{seq_type}:'")
-        seq = fasta.Sequence(name=None, sequence=seq, type=seq_type)
-        group = ((1, seq.labile_formula),)
-        composite = ((1, group),)
+        seq = Sequence(name=None, sequence=seq, type=seq_type)
+        pairs = ((1, seq.labile_formula),)
+        composite = ((1, pairs), )
+        # print("fasta output", composite)
         return composite
 
-    def density(self, tokens):
+    def density(self, tokens) -> tuple[str, float, str]:
         """
         Returns a density tuple from the @density construct. Density mode 'n' for
         natural or 'i' for isotopic defaults to isotopic. That is, D2O@1.11 is the
@@ -394,7 +399,7 @@ def density(self, tokens):
         mode = 'i' if not tokens[1] else tokens[1]
         return 'density', value, mode
 
-    def compound(self, tokens):
+    def compound(self, tokens) -> Formula:
         """
         Returns the formula for the compound, with optional density set.
 
@@ -411,15 +416,15 @@ def compound(self, tokens):
 
         Transform: [((number, group), ...), ('density', value, mode)|None] => formula
 
-        Example NaCl@2.16i: [(1, ((1, Na), (1, Cl))), ('density', 2.16, 'i')] => NaCl@2.16i
+        Example NaCl@2.16i: [((1, ((1, Na), (1, Cl))),), ('density', 2.16, 'i')] => NaCl@2.16i
 
-        Example dna:CAGT: [((1, ((1, C39H37H[1]10N15O25P4@1.69n),)),), None] => C39H37H[1]10N15O25P4@1.69n
+        Example dna:CAGT: [((1, ((1, formula('C39H37H[1]10N15O25P4')),)),), None] => C39H37H[1]10N15O25P4@1.69n
 
         Example CaCO3 6H2O: [((1, ((1, Ca), (1, C), (3, O))), (6, ((2, H), (1, O)))), None] => CaCO3(H2O)6
 
-        Example CaCO3(H20)6: [((1, ((1, Ca), (1, C), (3, O), (6, H2O@None))),), None] => CaCO3(H2O)6
+        Example CaCO3(H2O)6: [((1, ((1, Ca), (1, C), (3, O), (6, formula('H2O')))),), None] => CaCO3(H2O)6
         """
-        # print("in compound with", tokens)
+        # print("compound tokens", tokens)
         components, density_tuple = tokens
         if density_tuple is None:
             density, density_mode = None, 'i'
@@ -458,10 +463,10 @@ def expand_formula(group):
             else:
                 formula.density = density
 
-        # print(f"compound = {formula} @ {formula.density}")
+        # print(f"compound output {formula} @ {formula.density}")
         return formula
 
-    def weightpct(self, tokens):
+    def weightpct(self, tokens) -> float:
         """
         Returns the percentage. The value has already be converted to a number.
 
@@ -473,7 +478,7 @@ def weightpct(self, tokens):
         """
         return tokens[0]
 
-    def volumepct(self, tokens):
+    def volumepct(self, tokens) -> float:
         """
         Returns the percentage. The value has already be converted to a number.
 
@@ -485,7 +490,7 @@ def volumepct(self, tokens):
         """
         return tokens[0]
 
-    def percentage(self, tokens):
+    def percentage(self, tokens) -> float:
         """
         Returns the percentage. The value has already be converted to a number.
 
@@ -495,7 +500,7 @@ def percentage(self, tokens):
         """
         return tokens[0]
 
-    def byweight(self, tokens):
+    def byweight(self, tokens) -> Formula:
         """
         Returns mixture by wt% of the various components in the system.
 
@@ -516,7 +521,7 @@ def byweight(self, tokens):
         # print(f"byweight => {formula} @ {formula.density}")
         return formula
 
-    def byvolume(self, tokens):
+    def byvolume(self, tokens) -> Formula:
         """
         Returns mixture by vol% of the various components in the system. Volumes are converted
         to mass using density.
@@ -540,7 +545,7 @@ def byvolume(self, tokens):
         formula = _mix_by_volume_pairs(pairs)
         return formula
 
-    def byamount(self, tokens):
+    def byamount(self, tokens) -> Formula:
         """
         Returns mixture by mass of the various components in the system. Volumes are converted
         to mass using density.
@@ -568,7 +573,7 @@ def find_value(quantity, formula):
         formula.total_mass = total
         return formula
 
-    def layers(self, tokens):
+    def layers(self, tokens) -> Formula:
         """
         Returns the mixture by volume of the various layers in the system.
 
@@ -590,7 +595,7 @@ def layers(self, tokens):
         formula.thickness = total
         return formula
 
-    def mixture(self, tokens):
+    def mixture(self, tokens) -> Formula:
         """
         Returns the formula representing the mixture, either byweight, byvolume, byamount or layers
 
@@ -598,7 +603,7 @@ def mixture(self, tokens):
         """
         return tokens[0]
 
-    def formula(self, tokens):
+    def formula(self, tokens) -> Formula:
         """
         Return the formula representing the compound or mixture.
 
@@ -606,7 +611,7 @@ def formula(self, tokens):
         """
         return tokens[0]
 
-    def thickness(self, tokens):
+    def thickness(self, tokens) -> tuple[str, float, str]:
         """
         Returns (dimension, value, unit) with dimension equal 'length'
 
@@ -617,7 +622,7 @@ def thickness(self, tokens):
         value, (dim, units) = tokens
         return dim, value, units
 
-    def quantity(self, tokens):
+    def quantity(self, tokens) -> tuple[str, float, str]:
         """
         Returns (dimension, value, unit) with dimension equal 'mass' or 'volume'
 
@@ -628,7 +633,7 @@ def quantity(self, tokens):
         value, (dim, units) = tokens
         return dim, value, units
 
-    def start(self, tokens):
+    def start(self, tokens) -> Formula:
         """
         Return the final formula, with the original text attached.
 
@@ -708,7 +713,7 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
     Parse a chemical formula, returning a structure with elements from the
     given periodic table.
     """
-    cleanup = StripJunk()
+    cleanup = StripPunctuation()
     convert = ConvertTokens(formula_str, table=table)
     try:
         tree = formula_parser.parse(formula_str)
@@ -821,6 +826,8 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
 """
 
 def check():
+    from periodictable.formulas import parse_formula as old_parser
+
     for line in examples.split('\n'):
         formula = line.split('#')[0]
         bad = line.startswith('!')
@@ -833,8 +840,8 @@ def check():
                 print(f"*** {line}")
             try:
                 # Toggle the following to test pyparsing vs lark
-                tree = parse_formula(formula)
-                #tree = pt.formula(formula) if "##" not in line else "!!! pyparsing fails"
+                #tree = parse_formula(formula)
+                tree = old_parser(formula) if "##" not in line else "!!! pyparsing fails"
                 density = getattr(tree, 'density', None)
                 density_str = f" @ {density:.2f}" if density else ""
                 mode = 'unicode' # unicode latex html plain
@@ -847,9 +854,15 @@ def check():
                 else:
                     raise exc from None
             else:
-                if '##' in line: continue  # pyparsing should fail but doesn't
+                if '##' in line:
+                    continue  # pyparsing should fail but doesn't
                 if bad:
                     raise RuntimeError(f"Exception not raised for <{formula}>")
 
 if __name__ == "__main__":
-    check()
\ No newline at end of file
+    import sys
+    if len(sys.argv) > 1:
+        for arg in sys.argv[1:]:
+            print(parse_formula(arg))
+    else:
+        check()
\ No newline at end of file
diff --git a/periodictable/fasta.py b/periodictable/fasta.py
index b378477..f95d9ed 100644
--- a/periodictable/fasta.py
+++ b/periodictable/fasta.py
@@ -75,7 +75,7 @@
 from collections.abc import Iterator
 from typing import IO, cast
 
-from .formulas import formula as parse_formula, Formula, FormulaInput
+from .formulas import formula as make_formula, Formula, FormulaInput
 from .nsf import neutron_sld
 from .xsf import xray_sld
 from .core import default_table, Atom
@@ -177,7 +177,7 @@ def __init__(
         elements = default_table()
 
         # Fill in density or cell_volume.
-        M = parse_formula(formula, natural_density=density)
+        M = make_formula(formula, natural_density=density)
         # CRUFT: use of T rather than H[1] is deprecated since 1.5.3
         if elements.T in M.atoms:
             warnings.warn("Use of tritium for labile hydrogen is deprecated."
@@ -274,7 +274,7 @@ def __init__(self, name: str, sequence: str, type: str='aa'):
             structure.extend(list(p.labile_formula.structure))
         # Add H + OH terminators to the sequence
         structure.extend(((2, elements.H[1]), (1, elements.O)))
-        formula = parse_formula(structure).hill
+        formula = make_formula(structure).hill
 
         Molecule.__init__(
             self, name, formula, cell_volume=cell_volume, charge=charge)
@@ -356,7 +356,7 @@ def _code_average(bases, code_table) -> tuple[Formula, float, float]:
     Note: averaging can lead to a fractional charge on the returned molecule.
     """
     n = len(bases)
-    formula, cell_volume, charge = parse_formula(), 0., 0.
+    formula, cell_volume, charge = make_formula(), 0., 0.
     for c in bases:
         base = code_table[c]
         formula += base.labile_formula

From 3412542bc6a244b72b4490d9115a5a91ab0de327 Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Wed, 20 May 2026 20:22:20 -0400
Subject: [PATCH 09/19] adjust imports; improve :SEQ error message

---
 {explore => periodictable}/lark_parse.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)
 rename {explore => periodictable}/lark_parse.py (99%)

diff --git a/explore/lark_parse.py b/periodictable/lark_parse.py
similarity index 99%
rename from explore/lark_parse.py
rename to periodictable/lark_parse.py
index ac412b2..d5479d4 100644
--- a/explore/lark_parse.py
+++ b/periodictable/lark_parse.py
@@ -1,7 +1,7 @@
 import lark
-from periodictable.core import PeriodicTable, Element, Atom, Isotope
-from periodictable.core import default_table
-from periodictable.formulas import (
+from .core import PeriodicTable, Element, Atom, Isotope
+from .core import default_table
+from .formulas import (
     from_subscript, from_superscript,
     Formula, Structure,
     _mix_by_weight_pairs, _mix_by_volume_pairs,
@@ -663,7 +663,7 @@ def _allowed(allowed):
         NUMBER="NUMBER", # start of compound or start of mixture
         #FASTA="[dna|rna|aa]:SEQ",
         FASTA="aa:SEQ",
-        COLON=":",
+        COLON=":SEQ",
         #COLON="aa:SEQ",
         SEQUENCE="aa:SEQ",
         SEPARATOR="+", # generic group separator in composite
@@ -840,8 +840,8 @@ def check():
                 print(f"*** {line}")
             try:
                 # Toggle the following to test pyparsing vs lark
-                #tree = parse_formula(formula)
-                tree = old_parser(formula) if "##" not in line else "!!! pyparsing fails"
+                tree = parse_formula(formula)
+                #tree = old_parser(formula) if "##" not in line else "!!! pyparsing fails"
                 density = getattr(tree, 'density', None)
                 density_str = f" @ {density:.2f}" if density else ""
                 mode = 'unicode' # unicode latex html plain
@@ -859,10 +859,14 @@ def check():
                 if bad:
                     raise RuntimeError(f"Exception not raised for <{formula}>")
 
-if __name__ == "__main__":
+def main():
     import sys
+
     if len(sys.argv) > 1:
         for arg in sys.argv[1:]:
             print(parse_formula(arg))
     else:
-        check()
\ No newline at end of file
+        check()
+
+if __name__ == "__main__":
+    main()

From a1579af92edf624517ff999ef78c7a4a188fa0ba Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Wed, 20 May 2026 20:56:38 -0400
Subject: [PATCH 10/19] move unicode superscript/subscript conversion to util;
 clean up type hints

---
 periodictable/formulas.py   | 84 ++++---------------------------------
 periodictable/lark_parse.py | 14 ++++---
 periodictable/util.py       | 72 +++++++++++++++++++++++++++++++
 3 files changed, 89 insertions(+), 81 deletions(-)

diff --git a/periodictable/formulas.py b/periodictable/formulas.py
index 2b118d7..0dc2fc1 100644
--- a/periodictable/formulas.py
+++ b/periodictable/formulas.py
@@ -6,7 +6,7 @@
 
 from copy import copy
 from math import pi, sqrt
-from typing import cast, Union, Any
+from typing import cast, Union, Any, Iterable
 from collections.abc import Sequence, Callable
 
 # Requires that the pyparsing module is installed.
@@ -15,9 +15,9 @@
                        ZeroOrMore, OneOrMore, Forward, StringEnd, Group)
 
 from .core import default_table, isatom, isisotope, ision, change_table
-from .core import Atom, Element, Isotope, Ion, PeriodicTable # for typing
+from .core import Atom, Isotope, Ion, PeriodicTable # for typing
 from .constants import avogadro_number, electron_mass
-from .util import cell_volume
+from .util import cell_volume, unicode_subscript, unicode_superscript
 
 FormulaInput = Union[str, "Formula", Atom, dict[Atom, float], Sequence[tuple[float, Any]], None]
 Fragment = tuple[float, Union[Atom, "Structure"]]
@@ -89,7 +89,7 @@ def mix_by_weight(*args, **kw) -> "Formula":
         result.name = name
     return result
 
-def _mix_by_weight_pairs(pairs: list[tuple["Formula", float]]) -> "Formula":
+def _mix_by_weight_pairs(pairs: Iterable[tuple["Formula", float]]) -> "Formula":
     from .formulas import Formula # For running as __main__
 
     # Drop pairs with zero quantity
@@ -175,7 +175,7 @@ def mix_by_volume(*args, **kw) -> "Formula":
         result.name = name
     return result
 
-def _mix_by_volume_pairs(pairs: list[tuple["Formula", float]]) -> "Formula":
+def _mix_by_volume_pairs(pairs: Iterable[tuple["Formula", float]]) -> "Formula":
     from .formulas import Formula # For running as __main__
 
     # Drop pairs with zero quantity
@@ -330,6 +330,8 @@ class Formula:
     structure: Structure
     density: float|None
     name: str|None
+    total_mass: float|None = None
+    thickness: float|None = None
 
     def __init__(self,
             structure: Structure=tuple(),
@@ -738,6 +740,7 @@ def formula_grammar(table: PeriodicTable) -> ParserElement:
     # This ickiness is because the formula class returned from the circular
     # import of fasta does not match the local formula class.
     from .formulas import Formula
+    from .util import from_subscript, from_superscript
 
     # Recursive
     composite = Forward()
@@ -1112,77 +1115,6 @@ def _str_atoms(seq) -> list[str]:
     return ret
 
 
-def from_subscript(value: str) -> str:
-    """
-    Convert unicode subscript characters to normal characters. This allows us to parse,
-    for example, H₂O as H2O.
-    """
-    codepoints = {
-        '\u2080': '0', '\u2081': '1', '\u2082': '2', '\u2083': '3',
-        '\u2084': '4', '\u2085': '5', '\u2086': '6', '\u2087': '7',
-        '\u2088': '8', '\u2089': '9', '\u208a': '+', '\u208b': '-',
-        '\u208c': '=', '\u208d': '(', '\u208e': ')',
-
-        '\u2090': 'a', '\u2091': 'e', '\u2092': 'o', '\u2093': 'x',
-        '\u2095': 'h', '\u2096': 'k', '\u2097': 'l',
-        '\u2098': 'm', '\u2099': 'n', '\u209a': 'p', '\u209b': 's',
-        '\u209c': 't',
-    }
-    return ''.join(codepoints.get(char, char) for char in str(value))
-
-def from_superscript(value: str) -> str:
-    """
-    Convert unicode superscript characters to normal characters. This allows us to parse,
-    for example, Ca²⁺ as Ca{2+}.
-    """
-    codepoints = {
-        '\u2070': '0', '\u00B9': '1', '\u00B2': '2', '\u00B3': '3',
-        '\u2074': '4', '\u2075': '5', '\u2076': '6', '\u2077': '7',
-        '\u2078': '8', '\u2079': '9', '\u207a': '+', '\u207b': '-',
-        '\u207c': '=', '\u207d': '(', '\u207e': ')',
-
-        '\u2071': 'i', '\u207f': 'n',
-    }
-    return ''.join(codepoints.get(char, char) for char in str(value))
-
-def unicode_subscript(value: str) -> str:
-    # Unicode subscript codepoints. Note that decimal point looks okay as subscript
-    codepoints = {
-        '0': '\u2080', '1': '\u2081', '2': '\u2082', '3': '\u2083',
-        '4': '\u2084', '5': '\u2085', '6': '\u2086', '7': '\u2087',
-        '8': '\u2088', '9': '\u2089', '+': '\u208a', '-': '\u208b',
-        '=': '\u208c', '(': '\u208d', ')': '\u208e',
-
-        'a': '\u2090', 'e': '\u2091', 'o': '\u2092', 'x': '\u2093',
-        'h': '\u2095', 'k': '\u2096', 'l': '\u2097',
-        'm': '\u2098', 'n': '\u2099', 'p': '\u209a', 's': '\u209b',
-        't': '\u209c',
-
-        '\u2013': '\u208b', # en-dash is same as dash
-        '\u2014': '\u208b', # em-dash is same as dash
-    }
-    return ''.join(codepoints.get(char, char) for char in str(value))
-
-def unicode_superscript(value: str) -> str:
-    # Unicode subscript codepoints. Note that decimal point looks okay as subscript
-    codepoints = {
-        #'.': '\u00B0',  # degree symbol looks too much like zero
-        #'.': ' \u02D9',  # dot above modifier looks okay in a floating string, but risky
-        #'.': ' \u0307',  # space with dot above?
-        #'.': '\u22C5', # math dot operator
-        '.': '\u1427',  # Canadian aboriginal extended block dot (looks good on mac)
-        '2': '\u00B2', '3': '\u00B3',
-        '1': '\u00B9',
-        '0': '\u2070', 'i': '\u2071',
-        '4': '\u2074', '5': '\u2075', '6': '\u2076', '7': '\u2077',
-        '9': '\u2078', '0': '\u2079', '+': '\u207a', '-': '\u207b',
-        '=': '\u207c', '(': '\u207d', ')': '\u207e', 'n': '\u207f',
-
-        '\u2013': '\u207b', # en-dash is same as dash
-        '\u2014': '\u207b', # em-dash is same as dash
-    }
-    return ''.join(codepoints.get(char, char) for char in str(value))
-
 SUBSCRIPT: dict[str, Callable[[str], str]] = {
     # The latex renderer should work for github style markdown
     'latex': lambda text: f'$_{{{text}}}$',
diff --git a/periodictable/lark_parse.py b/periodictable/lark_parse.py
index d5479d4..581bb5c 100644
--- a/periodictable/lark_parse.py
+++ b/periodictable/lark_parse.py
@@ -1,13 +1,16 @@
+from typing import cast
+
 import lark
+
 from .core import PeriodicTable, Element, Atom, Isotope
 from .core import default_table
 from .formulas import (
-    from_subscript, from_superscript,
     Formula, Structure,
     _mix_by_weight_pairs, _mix_by_volume_pairs,
     VOLUME_UNITS, MASS_UNITS, LENGTH_UNITS,
     pretty as pretty_formula
 )
+from .util import from_subscript, from_superscript
 
 # TODO: valence belongs to a group rather than element
 
@@ -196,7 +199,7 @@ def CHARGE(self, token) -> int:
         This is used in the valence rule to specify the charge for the atom.
         """
         return token.value
-    def SUPERCHARGE(self, token) -> int:
+    def SUPERCHARGE(self, token) -> str:
         """
         Convert sequence of superscript plus and minus characters to ASCII plus and minus.
 
@@ -374,11 +377,12 @@ def fasta(self, tokens) -> Structure:
         seq_type, seq = tokens
         if seq_type not in CODE_TABLES:
             raise ValueError(f"Invalid fasta sequence type '{seq_type}:'")
-        seq = Sequence(name=None, sequence=seq, type=seq_type)
+        seq = Sequence(name="seq", sequence=seq, type=seq_type)
         pairs = ((1, seq.labile_formula),)
-        composite = ((1, pairs), )
+        composite = ((1, pairs),)
         # print("fasta output", composite)
-        return composite
+        # return tuple[tuple[int, tuple[tuple[int, Formula]]]] as Structure
+        return cast(Structure, composite)
 
     def density(self, tokens) -> tuple[str, float, str]:
         """
diff --git a/periodictable/util.py b/periodictable/util.py
index d7fa8ec..0b7267c 100644
--- a/periodictable/util.py
+++ b/periodictable/util.py
@@ -53,6 +53,78 @@ def parse_uncertainty(s: str) -> tuple[float, float]|tuple[None, None]:
     # Plain value with no uncertainty
     return float(s), 0
 
+def from_subscript(value: str) -> str:
+    """
+    Convert unicode subscript characters to normal characters. This allows us to parse,
+    for example, H₂O as H2O.
+    """
+    codepoints = {
+        '\u2080': '0', '\u2081': '1', '\u2082': '2', '\u2083': '3',
+        '\u2084': '4', '\u2085': '5', '\u2086': '6', '\u2087': '7',
+        '\u2088': '8', '\u2089': '9', '\u208a': '+', '\u208b': '-',
+        '\u208c': '=', '\u208d': '(', '\u208e': ')',
+
+        '\u2090': 'a', '\u2091': 'e', '\u2092': 'o', '\u2093': 'x',
+        '\u2095': 'h', '\u2096': 'k', '\u2097': 'l',
+        '\u2098': 'm', '\u2099': 'n', '\u209a': 'p', '\u209b': 's',
+        '\u209c': 't',
+    }
+    return ''.join(codepoints.get(char, char) for char in str(value))
+
+def from_superscript(value: str) -> str:
+    """
+    Convert unicode superscript characters to normal characters. This allows us to parse,
+    for example, Ca²⁺ as Ca{2+}.
+    """
+    codepoints = {
+        '\u2070': '0', '\u00B9': '1', '\u00B2': '2', '\u00B3': '3',
+        '\u2074': '4', '\u2075': '5', '\u2076': '6', '\u2077': '7',
+        '\u2078': '8', '\u2079': '9', '\u207a': '+', '\u207b': '-',
+        '\u207c': '=', '\u207d': '(', '\u207e': ')',
+
+        '\u2071': 'i', '\u207f': 'n',
+    }
+    return ''.join(codepoints.get(char, char) for char in str(value))
+
+def unicode_subscript(value: str) -> str:
+    # Unicode subscript codepoints. Note that decimal point looks okay as subscript
+    codepoints = {
+        '0': '\u2080', '1': '\u2081', '2': '\u2082', '3': '\u2083',
+        '4': '\u2084', '5': '\u2085', '6': '\u2086', '7': '\u2087',
+        '8': '\u2088', '9': '\u2089', '+': '\u208a', '-': '\u208b',
+        '=': '\u208c', '(': '\u208d', ')': '\u208e',
+
+        'a': '\u2090', 'e': '\u2091', 'o': '\u2092', 'x': '\u2093',
+        'h': '\u2095', 'k': '\u2096', 'l': '\u2097',
+        'm': '\u2098', 'n': '\u2099', 'p': '\u209a', 's': '\u209b',
+        't': '\u209c',
+
+        '\u2013': '\u208b', # en-dash is same as dash
+        '\u2014': '\u208b', # em-dash is same as dash
+    }
+    return ''.join(codepoints.get(char, char) for char in str(value))
+
+def unicode_superscript(value: str) -> str:
+    # Unicode superscript codepoints. Decimal point has no superscript form; see '.' below.
+    codepoints = {
+        #'.': '\u00B0',  # degree symbol looks too much like zero
+        #'.': ' \u02D9',  # dot above modifier looks okay in a floating string, but risky
+        #'.': ' \u0307',  # space with dot above?
+        #'.': '\u22C5', # math dot operator
+        '.': '\u1427',  # Canadian aboriginal extended block dot (looks good on mac)
+        '2': '\u00B2', '3': '\u00B3',
+        '1': '\u00B9',
+        '0': '\u2070', 'i': '\u2071',
+        '4': '\u2074', '5': '\u2075', '6': '\u2076', '7': '\u2077',
+        '8': '\u2078', '9': '\u2079', '+': '\u207a', '-': '\u207b',
+        '=': '\u207c', '(': '\u207d', ')': '\u207e', 'n': '\u207f',
+
+        '\u2013': '\u207b', # en-dash is same as dash
+        '\u2014': '\u207b', # em-dash is same as dash
+    }
+    return ''.join(codepoints.get(char, char) for char in str(value))
+
+
 def cell_volume(a=None, b=None, c=None, alpha=None, beta=None, gamma=None) -> float:
     r"""
     Compute cell volume from lattice parameters.

From 0efb66629514b0b3f6f0c9b2eac21fed93e397bd Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Thu, 21 May 2026 17:14:23 -0400
Subject: [PATCH 11/19] fix tests and doc build

---
 doc/sphinx/conf.py                   |  7 ++-
 doc/sphinx/genmods.py                |  1 +
 doc/sphinx/guide/formula_grammar.rst | 66 ++++++++++++++++---------
 periodictable/formulas.py            | 58 ++++++++++++++--------
 periodictable/lark_parse.py          | 73 +++++++++++++++-------------
 pyproject.toml                       |  5 +-
 test/test_formulas.py                | 24 ++++++---
 7 files changed, 148 insertions(+), 86 deletions(-)

diff --git a/doc/sphinx/conf.py b/doc/sphinx/conf.py
index dad6c9e..d65594d 100644
--- a/doc/sphinx/conf.py
+++ b/doc/sphinx/conf.py
@@ -27,6 +27,7 @@
 sys.path.insert(0, os.path.abspath('../..'))
 sys.path.insert(0, os.path.abspath('_extensions'))
 import periodictable
+from periodictable.lark_parse import grammar
 
 
 # -- General configuration -----------------------------------------------------
@@ -65,6 +66,11 @@
     ('py:class', 'numpy._typing._array_like._ScalarT'),
     ('py:class', 'numpy._typing._nested_sequence._NestedSequence'),
     ('py:class', 'pyparsing.core.ParserElement'),
+    ('py:class', 'lark.tree.Tree'),
+    ('py:class', 'lark.lexer.Token'),
+    ('py:class', 'lark.visitors.Transformer'),
+    ('py:class', 'lark.visitors._Leaf_T'),
+    ('py:class', 'lark.visitors._Return_T'),
 
     ('py:class', 'periodictable.core._AtomBase'),
     ('py:class', 'periodictable.core.IonSet'),
@@ -300,4 +306,3 @@
 if os.path.exists('rst_prolog'):
     with io.open('rst_prolog', encoding='utf-8') as fid:
         rst_prolog = fid.read()
-
diff --git a/doc/sphinx/genmods.py b/doc/sphinx/genmods.py
index 9cdc46a..4b930e5 100644
--- a/doc/sphinx/genmods.py
+++ b/doc/sphinx/genmods.py
@@ -55,6 +55,7 @@ def genfiles(package, package_name, modules, dir='api'):
     #('__init__', 'Top level namespace'),
     ('core', 'Core table'),
     ('formulas', 'Chemical formula operations'),
+    ('lark_parse', 'Chemical formula parser'),
     ('covalent_radius', 'Covalent radius'),
     ('constants', 'Fundamental constants'),
     ('crystal_structure', 'Crystal structure'),
diff --git a/doc/sphinx/guide/formula_grammar.rst b/doc/sphinx/guide/formula_grammar.rst
index db1ab2c..da694cd 100644
--- a/doc/sphinx/guide/formula_grammar.rst
+++ b/doc/sphinx/guide/formula_grammar.rst
@@ -159,28 +159,50 @@ The grammar used for parsing formula strings is the following:
 
 ::
 
-    formula    :: compound | mixture | nothing
-    mixture    :: quantity | percentage
-    quantity   :: number unit part ('//' number unit part)*
-    percentage :: number 'wt%|vol%' part ('//' number '%' part)* '//' part
-    part       :: compound | '(' mixture ')'
-    compound   :: (composite | fasta) density?
-    fasta      :: ('dna' | 'rna' | 'aa') ':' [A-Z -*]+
-    composite  :: group (separator group)*
-    group      :: number element+ | '(' formula ')' number
-    element    :: symbol isotope? ion? number?
-    symbol     :: [A-Z][a-z]*
-    isotope    :: '[' integer ']'
-    ion        :: '{' integer? [+-] '}'
-    density    :: '@' number [ni]?
-    number     :: integer | fraction
-    integer    :: [1-9][0-9]*
-    fraction   :: ([1-9][0-9]* | 0)? '.' [0-9]*
-    separator  :: space? '+'? space?
-    unit       :: mass | volume | length
-    mass       :: 'kg' | 'g' | 'mg' | 'ug' | 'ng'
-    volume     :: 'L' | 'mL' | 'uL' | 'nL'
-    length     :: 'cm' | 'mm' | 'um' | 'nm'
+    formula    : compound | mixture
+
+    # Mixture definitions:  quantity compound // quantity compound // quantity compound
+    mixture    : byamount | byvolume | byweight | layers
+    byamount   : quantity compound (MIX quantity compound)*
+    byvolume   : volumepct compound (MIX percentage compound)* MIX compound
+    byweight   : weightpct compound (MIX percentage compound)* MIX compound
+    layers     : thickness compound (MIX thickness compound)*
+    quantity   : NUMBER SPACE? (MASS | VOLUME) SPACE
+    weightpct  : NUMBER SPACE? WEIGHTPCT SPACE
+    volumepct  : NUMBER SPACE? VOLUMEPCT SPACE
+    thickness  : NUMBER SPACE? LENGTH SPACE
+    percentage : NUMBER SPACE? "%" SPACE  # Allows "3 % "
+
+    # Compound definition: number group ... @density where group is El count El count ...
+    # FASTA sequences: (rna|dna|aa) : SEQUENCE @ density
+    # Density applies to the entire formula, such as "NaCl + 29.2H2O @ 1.07n"
+    # For the density of a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n"
+    compound   : (composite | fasta) [density]
+    fasta      : FASTA ":" SEQUENCE
+    composite  : [NUMBER] group (SEPARATOR [NUMBER] group)*
+    group      : ((atom | isoatom | "(" formula ")") [COUNT])+
+    atom       : SYMBOL [isotope] [valence]
+    isoatom    : SUPERINT SYMBOL [valence]    # For example ²H for deuterium
+    isotope    : "[" INTEGER "]"
+    valence    : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE
+    density    : SPACE? "@" SPACE? DENSITY [DENSITYMODE]
+
+    # Tokens
+    #FASTA     : /dna|rna|aa/  # Sequence type is limited to these values but ...
+    FASTA      : /[a-z]+/      # "type:sequence" syntax allows better error reporting
+    SEQUENCE   : /[-A-Z *]+/
+    # could list all elements, but better error reporting if element symbol lookup fails
+    SYMBOL     : /[A-Z][a-z]*/
+    CHARGE     : /[+]+|[-]+/  # allow valence using {++} or {--}
+    DENSITY    : NUMBER  # using alias DENSITY for number for better error reporting
+    DENSITYMODE: /[ni]/       # n=natural density, i=isotopic density
+    MIX        : SPACE? "//" SPACE?
+    WEIGHTPCT  : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/
+    VOLUMEPCT  : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/
+    MASS       : "kg" | "g" | "mg" | "ug" | "μg" | "ng"
+    VOLUME     : "L" | "mL" | "uL" | "μL" | "nL"
+    LENGTH     : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å"
+    COUNT      : NUMBER | SUBNUM  # atom counts can be normal numbers or unicode subscripts
 
 Formulas can also be constructed from atoms or other formulas:
 
diff --git a/periodictable/formulas.py b/periodictable/formulas.py
index 0dc2fc1..e752e3e 100644
--- a/periodictable/formulas.py
+++ b/periodictable/formulas.py
@@ -6,14 +6,9 @@
 
 from copy import copy
 from math import pi, sqrt
-from typing import cast, Union, Any, Iterable
+from typing import cast, Union, Any, Iterable, TYPE_CHECKING
 from collections.abc import Sequence, Callable
 
-# Requires that the pyparsing module is installed.
-
-from pyparsing import (ParserElement, Literal, Optional, White, Regex,
-                       ZeroOrMore, OneOrMore, Forward, StringEnd, Group)
-
 from .core import default_table, isatom, isisotope, ision, change_table
 from .core import Atom, Isotope, Ion, PeriodicTable # for typing
 from .constants import avogadro_number, electron_mass
@@ -227,7 +222,7 @@ def formula(
             change in cell volume.
 
         *name* : string
-            Common name for the molecule.
+            Common name for the material.
 
         *table* : PeriodicTable
             Private table to use when parsing string formulas.
@@ -288,6 +283,7 @@ def formula(
     display purposes.
     """
     from .formulas import Formula # For running as __main__
+    from .lark_parse import parse_formula
 
     structure: Structure
     if compound is None or compound == '':
@@ -328,10 +324,25 @@ class Formula:
     Simple chemical formula representation.
     """
     structure: Structure
+    """Nested structure ((count, atom|structure), ...)"""
     density: float|None
+    """
+    |g/cm^3|
+
+    Density of the material.
+    """
     name: str|None
+    """
+    Name of the material. Default is the input string for the formula parser.
+    """
     total_mass: float|None = None
+    """
+    For mixture by mass, the total mass of the mixture (g).
+    """
     thickness: float|None = None
+    """
+    For mixture by layer, the total thickness of the mixture (cm).
+    """
 
     def __init__(self,
             structure: Structure=tuple(),
@@ -413,7 +424,7 @@ def natural_density(self) -> float | None:
         """
         |g/cm^3|
 
-        Density of the formula with specific isotopes of each element
+        Density of the material with specific isotopes of each element
         replaced by the naturally occurring abundance of the element
         without changing the cell volume.
         """
@@ -677,7 +688,8 @@ def __rmul__(self, other):
         return ret
 
     def __str__(self):
-        return self.name if self.name else "".join(_str_atoms(self.structure))
+        # return self.name if self.name else "".join(_str_atoms(self.structure))
+        return "".join(_str_atoms(self.structure))
 
     def __repr__(self):
         return "formula('%s')"%(str(self))
@@ -711,15 +723,12 @@ def _isotope_substitution(compound: "Formula", source: Atom, target: Atom, porti
         density = compound.density
     return formula(atoms, density=density)
 
+if TYPE_CHECKING:
+    from pyparsing import ParserElement
 
 # TODO: Grammar should be independent of table
-# TODO: Parser can't handle meters as 'm' because it conflicts with the milli prefix
-LENGTH_UNITS = {'nm': 1e-9, 'um': 1e-6, 'μm': 1e-6, 'mm': 1e-3, 'cm': 1e-2, 'Ang': 1e-10, 'Å': 1e-10}
-MASS_UNITS = {'ng': 1e-9, 'ug': 1e-6, 'mg': 1e-3, 'g': 1e+0, 'kg': 1e+3}
-VOLUME_UNITS = {'nL': 1e-9, 'uL': 1e-6, 'mL': 1e-3, 'L': 1e+0}
-LENGTH_RE = '('+'|'.join(LENGTH_UNITS.keys())+')'
-MASS_VOLUME_RE = '('+'|'.join(list(MASS_UNITS.keys())+list(VOLUME_UNITS.keys()))+')'
-def formula_grammar(table: PeriodicTable) -> ParserElement:
+
+def formula_grammar(table: PeriodicTable) -> "ParserElement":
     """
     Construct a parser for molecular formulas.
 
@@ -736,11 +745,22 @@ def formula_grammar(table: PeriodicTable) -> ParserElement:
             an *element* or a list of pairs (*count, fragment*).
 
     """
+    # Requires that the pyparsing module is installed.
+
+    from pyparsing import (
+        Literal, Optional, White, Regex, ZeroOrMore, OneOrMore, Forward, StringEnd, Group,
+        )
+
     # TODO: fix circular imports
     # This ickiness is because the formula class returned from the circular
     # import of fasta does not match the local formula class.
     from .formulas import Formula
-    from .util import from_subscript, from_superscript
+    from .util import from_subscript
+    from .lark_parse import LENGTH_UNITS, MASS_UNITS, VOLUME_UNITS
+
+    LENGTH_RE = '('+'|'.join(LENGTH_UNITS.keys())+')'
+    MASS_VOLUME_RE = '('+'|'.join(list(MASS_UNITS.keys())+list(VOLUME_UNITS.keys()))+')'
+
 
     # Recursive
     composite = Forward()
@@ -989,8 +1009,8 @@ def convert_mixture(string, location, tokens):
     grammar.set_name('Chemical Formula')
     return grammar
 
-_PARSER_CACHE: dict[PeriodicTable, ParserElement] = {}
-def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
+_PARSER_CACHE: dict[PeriodicTable, "ParserElement"] = {}
+def old_parser(formula_str: str, table: PeriodicTable|None=None) -> Formula:
     """
     Parse a chemical formula, returning a structure with elements from the
     given periodic table.
diff --git a/periodictable/lark_parse.py b/periodictable/lark_parse.py
index 581bb5c..6b2ab83 100644
--- a/periodictable/lark_parse.py
+++ b/periodictable/lark_parse.py
@@ -7,25 +7,29 @@
 from .formulas import (
     Formula, Structure,
     _mix_by_weight_pairs, _mix_by_volume_pairs,
-    VOLUME_UNITS, MASS_UNITS, LENGTH_UNITS,
     pretty as pretty_formula
 )
 from .util import from_subscript, from_superscript
 
 # TODO: valence belongs to a group rather than element
 
+# TODO: Parser can't handle meters as 'm' because it conflicts with the milli prefix
+LENGTH_UNITS = {'nm': 1e-9, 'um': 1e-6, 'μm': 1e-6, 'mm': 1e-3, 'cm': 1e-2, 'Ang': 1e-10, 'Å': 1e-10}
+MASS_UNITS = {'ng': 1e-9, 'ug': 1e-6, 'mg': 1e-3, 'g': 1e+0, 'kg': 1e+3}
+VOLUME_UNITS = {'nL': 1e-9, 'uL': 1e-6, 'mL': 1e-3, 'L': 1e+0}
+
+# TODO: use grammar string directly in the sphinx/guide/formula_grammar.rst
 grammar = """
-start      : SPACE? formula SPACE? # strip blank space from start and end
+start      : SPACE? formula SPACE?  # strip blank space from start and end
 formula    : compound | mixture
 
 # Mixture definitions:  quantity compound // quantity compound // quantity compound
 # Activation only cares about total mass, so you can freely mix masses and volumes if
-# you have the density for each component. Scattering cares about density of the mixture,
-# which in general is different from the mixture of densities.
-# To convert layers to masses for activation estimates we need density. Also need to scale by
-# area to convert density and thickness to mass. Assume unit area is cm^2, so for
-# example "4 (5 nm Ni // 2 mm Si)" is a 4 cm^2 wafer of nickel on silicon. If you
-# were to add a polymer you would need its density: "4 (20 nm C5H10@1.2
+# you have the density for each component. For scattering you need the density of the
+# mixture. When this is different from the mixture of densities use (mixture)@density.
+# For thin film samples, allow stacking of layers with the thickness of each layer.
+# With density for each layer the relative quantities of each element in the stack can
+# be calculated. Convert to mass by multiplying by thickness (cm) and area (cm²).
 
 mixture    : byamount | byvolume | byweight | layers
 byamount   : quantity compound (MIX quantity compound)*
@@ -38,49 +42,48 @@
 thickness  : NUMBER SPACE? LENGTH SPACE
 percentage : NUMBER SPACE? "%" SPACE  # Allows "3 % "
 
-# Compound definition: number group ... @ density where group is El count El count ...
+# Composite: number group ... @density where group is El count El count ...
+# Density applies to the entire composite, such as "NaCl + 29.2H2O @ 1.07n"
+# For the density of a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n"
 # FASTA sequences: (rna|dna|aa) : SEQUENCE @ density
-# Density applies to the entire formula, such as "NaCl + 29.2H2O @ 1.07n"
-# If you do this as a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n"
-# Note: `[token]` leaves a None placeholder in the tree, unlike `token?`
+# Note: optional `[token]` leaves a None placeholder in the tree, unlike `token?`
 compound   : (composite | fasta) [density]
 fasta      : FASTA ":" SEQUENCE
-FASTA      : /[a-z]+/ # Generic "str:sequence" syntax allows better error reporting
-#FASTA     : /dna|rna|aa/
-SEQUENCE   : /[A-Z -*]+/
 composite  : [NUMBER] group (SEPARATOR [NUMBER] group)*
 group      : ((atom | isoatom | "(" formula ")") [COUNT])+
 atom       : SYMBOL [isotope] [valence]
-isoatom    : SUPERINT SYMBOL [valence]
-# could list all elements, but better error reporting if element symbol lookup fails
-SYMBOL     : /[A-Z][a-z]*/
+isoatom    : SUPERINT SYMBOL [valence]    # For example ²H for deuterium
 isotope    : "[" INTEGER "]"
 valence    : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE
 density    : SPACE? "@" SPACE? DENSITY [DENSITYMODE]
-DENSITY    : NUMBER  # using alias DENSITY for number for better error reporting
 
 # Tokens
+#FASTA     : /dna|rna|aa/  # Sequence type is limited to these values but ...
+FASTA      : /[a-z]+/      # "str:sequence" syntax allows better error reporting
+SEQUENCE   : /[-A-Z *]+/
+# could list all elements, but better error reporting if element symbol lookup fails
+SYMBOL     : /[A-Z][a-z]*/
 CHARGE     : /[+]+|[-]+/  # allow valence using {++} or {--}
-SUPERINT   : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/
-SUPERCHARGE: /\u207A+|\u207B+/  # Allow Ca++ and Cl- using superscript + and -
-DENSITYMODE: /[ni]/
+DENSITY    : NUMBER  # using alias DENSITY for number for better error reporting
+DENSITYMODE: /[ni]/       # n=natural density, i=isotopic density
 MIX        : SPACE? "//" SPACE?
-# maybe drop "wt%" and "vol%"
 WEIGHTPCT  : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/
 VOLUMEPCT  : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/
 MASS       : "kg" | "g" | "mg" | "ug" | "μg" | "ng"
 VOLUME     : "L" | "mL" | "uL" | "μL" | "nL"
 LENGTH     : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å"
+COUNT      : NUMBER | SUBNUM  # atom counts can be normal numbers or unicode subscripts
 
 SEPARATOR  : SPACE? /[+•·]/ SPACE? | SPACE
 SPACE      : /[ \\t\\n\\r]+/
-COUNT      : NUMBER | SUBNUM  # atom counts can be normal numbers or unicode subscripts
 NUMBER     : INTEGER | FRACTION
 INTEGER    : /[1-9][0-9]*/
 FRACTION   : /([1-9][0-9]*|0)?[.][0-9]*/  # allow all floats?
 SUBNUM     : SUBINT | SUBFRAC
 SUBINT     : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*)/
 SUBFRAC    : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*|)([.][\u2080-\u2089]*)/
+SUPERINT   : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/
+SUPERCHARGE: /\u207A+|\u207B+/  # Allow Ca++ and Cl- using superscript + and -
 """
 
 # propagate_positions saves start_pos and end_pos for each rule as well as each terminal.
@@ -356,17 +359,16 @@ def composite(self, tokens) -> Structure:
         return pairs
 
     def fasta(self, tokens) -> Structure:
-        """
+        r"""
         Returns the formula corresponding to the FASTA sequence, with the natural
         density set. Labile hydrogen use H[1] in the formula.
 
         The extra level of nesting in the return value is so that the fasta structure
         is like a composite with a single group containing a nested formula.
 
-        Transform: [ /aa|dna|rna/, /[A-Z -*]+/ ] => (1, ((1, formula),))
+        Transform: [ 'aa|dna|rna', '[-A-Z \*]+' ] => (1, ((1, formula),))
 
-        Example dna:CAGT: ['dna', 'CAGT']
-        => ((1, ((1, formula('C39H37H[1]10N15O25P4')),)),)
+        Example: dna:CAGT: ['dna', 'CAGT'] x=> ((1, ((1, formula('C39H37H[1]10N15O25P4')),)),)
         """
         # TODO: fasta is ignoring table when parsing
         # TODO: avoid circular imports
@@ -596,7 +598,7 @@ def layers(self, tokens) -> Formula:
         total = sum(values)
         percent = [(m/total)*100 for m in values]
         formula = _mix_by_volume_pairs(zip(tokens[1::2], percent))
-        formula.thickness = total
+        formula.thickness = 100*total # convert meters to centimeters for cgs units
         return formula
 
     def mixture(self, tokens) -> Formula:
@@ -641,14 +643,13 @@ def start(self, tokens) -> Formula:
         """
         Return the final formula, with the original text attached.
 
-        Sets formula.source to 'parse string' before returning.
+        Sets formula.name to the parser input string before returning.
 
         Transform: [formula] => formula
         """
         formula = tokens[0]
-        # TODO: add the source string to the formula class attributes
         # Remember the string which was parsed
-        formula.source = self._context
+        formula.name = self._context
         return formula
 
 # TODO: if the next character is ":" then report error as bad fasta sequence type
@@ -830,7 +831,7 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
 """
 
 def check():
-    from periodictable.formulas import parse_formula as old_parser
+    from periodictable.formulas import old_parser
 
     for line in examples.split('\n'):
         formula = line.split('#')[0]
@@ -868,7 +869,11 @@ def main():
 
     if len(sys.argv) > 1:
         for arg in sys.argv[1:]:
-            print(parse_formula(arg))
+            formula = parse_formula(arg)
+            mass = f" {formula.total_mass:.4g} g" if formula.total_mass else ""
+            density = f"@{formula.density:.4g}" if formula.density else ""
+            thickness = f" {10*formula.thickness:.4g} mm" if formula.thickness else ""
+            print(f"{formula}{density}{mass}{thickness}")
     else:
         check()
 
diff --git a/pyproject.toml b/pyproject.toml
index 1b651ef..c4af2ef 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,8 @@
     ]
     license = { file = "LICENSE.txt" }
     dependencies = [
-        "pyparsing>=3.0.0", "numpy",
+        "numpy",
+        "lark",
     ]
 
     classifiers = [
@@ -29,7 +30,7 @@
     # Matplotlib and uncertainties are optional packages, used for making
     # plots in the docs and generating neutron data tables for the web.
     # mypy checks all code so they are needed for testing as well.
-    optional = ["uncertainties", "matplotlib"]
+    optional = ["uncertainties", "matplotlib", "pyparsing>=3.0.0"]
     docs = ["sphinx", {include-group = "optional"}]
     test = ["pytest", "pytest-cov", "pytest-mypy", {include-group = "optional"}]
     dev = [
diff --git a/test/test_formulas.py b/test/test_formulas.py
index eda94de..4eb2b9e 100644
--- a/test/test_formulas.py
+++ b/test/test_formulas.py
@@ -14,12 +14,19 @@ def check_parse_fails(s):
     raise Exception(f'formula("{s}") should fail to parse')
 
 def test():
+    # CaCO3(H2O)6 is a tuple of (count, atom) followed by (6, H2O)
+    # CaCO3+6H2O is ((1, CaCO3), (6, H2O))
     ikaite = formula()
-    # Note: this should be a tuple of tuples
     ikaite.structure = ((1, Ca), (1, C), (3, O), (6, ((2, H), (1, O))))
+    ikaite.name = "CaCO3(H2O)6"
+    ikaite_grouped = formula()
+    ikaite_grouped.structure = ((1, ((1, Ca), (1, C), (3, O))), (6, ((2, H), (1, O))))
+    ikaite_grouped.name = "CaCO3+6H2O"
 
     # Test print
     assert str(ikaite) == "CaCO3(H2O)6"
+    assert str(ikaite_grouped) == "CaCO3(H2O)6"
+    # TODO: parsing a printed structure should produce the same structure
 
     # Test constructors
     assert ikaite == formula([(1, Ca), (1, C), (3, O), (6, [(2, H), (1, O)])])
@@ -31,9 +38,9 @@ def test():
     assert formula("Ca") == formula([(1, Ca)])
     assert formula("Ca") == formula(Ca)
     assert formula("CaCO3") == formula([(1, Ca), (1, C), (3, O)])
-    assert ikaite == formula("CaCO3+6H2O")
-    assert ikaite == formula("(CaCO3+6H2O)1")
-    assert ikaite == formula("CaCO3 6H2O")
+    assert ikaite_grouped == formula("CaCO3+6H2O")
+    assert ikaite_grouped == formula("(CaCO3+6H2O)1")
+    assert ikaite_grouped == formula("CaCO3 6H2O")
     assert ikaite == formula("CaCO3(H2O)6")
     assert ikaite == formula("(CaCO3(H2O)6)1")
     assert ikaite.hill == formula("CCaO3(H2O)6").hill
@@ -43,7 +50,7 @@ def test():
     # Unicode, latex and html subscripts
     assert formula([(0.75, Fe), (0.25, Ni)]) == formula("Fe₀.₇₅Ni₀.₂₅")
     assert ikaite == formula("CaCO₃(H₂O)₆")
-    assert ikaite == formula("CaCO₃6H₂O") # with subscripts we know it isn't O36
+    assert ikaite_grouped == formula("CaCO₃ 6H₂O") # with subscripts we know it isn't O36
     assert pretty(ikaite, 'unicode') == "CaCO₃(H₂O)₆"
     assert pretty(ikaite, 'html') == "CaCO<sub>3</sub>(H<sub>2</sub>O)<sub>6</sub>"
     assert pretty(ikaite, 'latex') == "CaCO$_{3}$(H$_{2}$O)$_{6}$"
@@ -116,14 +123,15 @@ def test():
 
     # Check that names work
     permalloy = formula('Ni8Fe2', 8.692, name='permalloy')
-    assert str(permalloy) == 'permalloy'
+    assert str(permalloy) == 'Ni8Fe2'
+    assert permalloy.name == 'permalloy'
 
     # Check that get/restore state works
     assert deepcopy(permalloy).__dict__ == permalloy.__dict__
 
     # Check that copy constructor works
-    #print permalloy.__dict__
-    #print formula(permalloy).__dict__
+    # print(permalloy.__dict__)
+    # print(formula(permalloy).__dict__)
     assert formula(permalloy).__dict__ == permalloy.__dict__
     assert formula('Si', name='Silicon').__dict__ != formula('Si').__dict__
 

From 42278e633b372dcb840c4add2a6e07e4c0730da0 Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Thu, 21 May 2026 17:22:40 -0400
Subject: [PATCH 12/19] remove pyparsing dependency

---
 ChangeLog.rst               |   7 +
 periodictable/formulas.py   | 299 ------------------------------------
 periodictable/lark_parse.py |   4 -
 pyproject.toml              |   2 +-
 4 files changed, 8 insertions(+), 304 deletions(-)

diff --git a/ChangeLog.rst b/ChangeLog.rst
index 8a7efa9..5ea8a37 100644
--- a/ChangeLog.rst
+++ b/ChangeLog.rst
@@ -23,6 +23,13 @@ Known issues
 Change history
 ==============
 
+2026-05-21 R2.2.0
+-----------------
+
+Modified:
+
+* Use lark instead of pyparsing for the formula parser, giving better error reporting
+
 2026-02-27 R2.1.0
 -----------------
 
diff --git a/periodictable/formulas.py b/periodictable/formulas.py
index e752e3e..8c16a1b 100644
--- a/periodictable/formulas.py
+++ b/periodictable/formulas.py
@@ -723,305 +723,6 @@ def _isotope_substitution(compound: "Formula", source: Atom, target: Atom, porti
         density = compound.density
     return formula(atoms, density=density)
 
-if TYPE_CHECKING:
-    from pyparsing import ParserElement
-
-# TODO: Grammar should be independent of table
-
-def formula_grammar(table: PeriodicTable) -> "ParserElement":
-    """
-    Construct a parser for molecular formulas.
-
-    :Parameters:
-
-        *table* = None : PeriodicTable
-             If table is specified, then elements and their associated fields
-             will be chosen from that periodic table rather than the default.
-
-    :Returns:
-        *parser* : pyparsing.ParserElement.
-            The ``parser.parse_string()`` method returns a list of
-            pairs (*count, fragment*), where fragment is an *isotope*,
-            an *element* or a list of pairs (*count, fragment*).
-
-    """
-    # Requires that the pyparsing module is installed.
-
-    from pyparsing import (
-        Literal, Optional, White, Regex, ZeroOrMore, OneOrMore, Forward, StringEnd, Group,
-        )
-
-    # TODO: fix circular imports
-    # This ickiness is because the formula class returned from the circular
-    # import of fasta does not match the local formula class.
-    from .formulas import Formula
-    from .util import from_subscript
-    from .lark_parse import LENGTH_UNITS, MASS_UNITS, VOLUME_UNITS
-
-    LENGTH_RE = '('+'|'.join(LENGTH_UNITS.keys())+')'
-    MASS_VOLUME_RE = '('+'|'.join(list(MASS_UNITS.keys())+list(VOLUME_UNITS.keys()))+')'
-
-
-    # Recursive
-    composite = Forward()
-    mixture = Forward()
-
-    # whitespace and separators
-    space = Optional(White().suppress())
-    separator = space+Literal('+').suppress()+space
-
-    # Lookup the element in the element table
-    symbol = Regex("[A-Z][a-z]?")
-    symbol.set_parse_action(lambda s, l, t: table.symbol(t[0]))
-
-    # Translate isotope
-    openiso = Literal('[').suppress()
-    closeiso = Literal(']').suppress()
-    isotope = Optional(~White()+openiso+Regex("[1-9][0-9]*")+closeiso,
-                       default='0')
-    isotope.set_parse_action(lambda s, l, t: int(t[0]) if t[0] else 0)
-
-    # Translate ion
-    openion = Literal('{').suppress()
-    closeion = Literal('}').suppress()
-    ion = Optional(~White() +openion +Regex("([1-9][0-9]*)?[+-]") +closeion,
-                   default='0+')
-    ion.set_parse_action(lambda s, l, t: int(t[0][-1]+(t[0][:-1] if len(t[0]) > 1 else '1')))
-
-    # Translate counts
-    # TODO: regex should reject a bare '.' if we want to allow dots between formula parts
-    fract = Regex("(0|[1-9][0-9]*|)([.][0-9]*)")
-    fract.set_parse_action(lambda s, l, t: float(t[0]) if t[0] else 1)
-    whole = Regex("(0|[1-9][0-9]*)")
-    whole.set_parse_action(lambda s, l, t: int(t[0]) if t[0] else 1)
-    number = Optional(~White()+(fract|whole), default=1)
-    # TODO use unicode ₀₁₉ in the code below?
-    sub_fract = Regex("(\u2080|[\u2081-\u2089][\u2080-\u2089]*|)([.][\u2080-\u2089]*)")
-    sub_fract.set_parse_action(lambda s, l, t: float(from_subscript(t[0])) if t[0] else 1)
-    sub_whole = Regex("(\u2080|[\u2081-\u2089][\u2080-\u2089]*)")
-    sub_whole.set_parse_action(lambda s, l, t: int(from_subscript(t[0])) if t[0] else 1)
-    sub_count = Optional(~White()+(fract|whole|sub_fract|sub_whole), default=1)
-
-    # Fasta code
-    fasta = Regex("aa|rna|dna") + Literal(":").suppress() + Regex("[A-Z *-]+")
-    def convert_fasta(string, location, tokens):
-        #print("fasta", string, location, tokens)
-        # TODO: fasta is ignoring table when parsing
-        # TODO: avoid circular imports
-        # TODO: support other biochemicals (carbohydrate residues, lipids)
-        from . import fasta
-        seq_type, seq = tokens
-        if seq_type not in fasta.CODE_TABLES:
-            raise ValueError(f"Invalid fasta sequence type '{seq_type}:'")
-        seq = fasta.Sequence(name=None, sequence=seq, type=seq_type)
-        return seq.labile_formula
-    fasta.set_parse_action(convert_fasta)
-
-    # Convert symbol, isotope, ion, count to (count, isotope)
-    element = symbol+isotope+ion+sub_count
-    def convert_element(string, location, tokens):
-        """interpret string as element"""
-        #print "convert_element received", tokens
-        symbol, isotope, ion, count = tokens[0:4]
-        if isotope != 0:
-            symbol = symbol[isotope]
-        if ion != 0:
-            symbol = symbol.ion[ion]
-        return (count, symbol)
-    element.set_parse_action(convert_element)
-
-    # Convert "count elements" to a pair
-    implicit_group = number+OneOrMore(element)
-    def convert_implicit(string, location, tokens):
-        """convert count followed by fragment"""
-        #print "implicit", tokens
-        count = tokens[0]
-        fragment = tokens[1:]
-        return fragment if count == 1 else (count, fragment)
-    implicit_group.set_parse_action(convert_implicit)
-
-    # Convert "(composite) count" to a pair
-    opengrp = space + Literal('(').suppress() + space
-    closegrp = space + Literal(')').suppress() + space
-    explicit_group = opengrp + composite + closegrp + sub_count
-    def convert_explicit(string, location, tokens):
-        """convert (fragment)count"""
-        #print "explicit", tokens
-        count = tokens[-1]
-        fragment = tokens[:-1]
-        return fragment if count == 1 else (count, fragment)
-    explicit_group.set_parse_action(convert_explicit)
-
-    # Build composite from a set of groups
-    group = implicit_group | explicit_group
-    implicit_separator = separator | space
-    composite << group + ZeroOrMore(implicit_separator + group)
-
-    density = Literal('@').suppress() + number + Optional(Regex("[ni]"), default='i')
-    compound = (composite|fasta) + Optional(density, default=None)
-    def convert_compound(string, location, tokens):
-        """convert material @ density or fasta @ density"""
-        # Messiness: both composite and density can be one or more tokens
-        # If density is missing then it is None, otherwise it is count + [ni]
-        # Compound can be a sequence of (count, fragment) pairs, or if it is
-        # a fasta sequence it may already be a formula.
-        material = tokens[:-1] if tokens[-1] is None else tokens[:-2]
-        #print("compound", material, type(material[0]), len(material))
-        if len(material) == 1 and isinstance(material[0], Formula):
-            formula = material[0]
-        else:
-            #print("unbundling material", material)
-            formula = Formula(structure=_immutable(material))
-        density, form = (None, None) if tokens[-1] is None else tokens[-2:]
-        #if density is None and formula.density is None:
-        #    # Estimate density from covalent radii and a 0.54 packing factor
-        #    mass = formula.molecular_mass
-        #    volume = formula.volume(packing_factor=0.54, H_radius=1.15)
-        #    density, form = mass/volume, 'n'
-        #    print(f"estimating density as {mass/volume=:.3f}")
-        if form == 'n':
-            formula.natural_density = density
-        elif form == 'i':
-            formula.density = density
-        #print("compound", formula, f"{formula.density=:.3f}")
-        return formula
-    compound.set_parse_action(convert_compound)
-
-    partsep = space + Literal('//').suppress() + space
-    percent = Literal('%').suppress()
-    weight = Regex("(w((eigh)?t)?|m(ass)?)").suppress()
-    volume = Regex("v(ol(ume)?)?").suppress()
-    weight_percent = (percent + weight) | (weight + percent) + space
-    volume_percent = (percent + volume) | (volume + percent) + space
-    mixture_by_weight = (number + weight_percent + mixture
-                 + ZeroOrMore(partsep+number+(weight_percent|percent)+mixture)
-                 + Optional(partsep + mixture, default=None))
-    def _parts_by_weight_vol(tokens):
-        #print("by weight or volume", tokens)
-        if tokens[-1] is None:
-            piece = tokens[1:-1:2]
-            fract = [float(v) for v in tokens[:-1:2]]
-            if abs(sum(fract) - 100) > 1e-12:
-                raise ValueError(f"Formula percentages must sum to 100%, not {sum(fract)}")
-        else:
-            piece = tokens[1:-1:2] + [tokens[-1]]
-            fract = [float(v) for v in tokens[:-1:2]]
-            fract.append(100-sum(fract))
-            if fract[-1] < 0:
-                raise ValueError("Formula percentages must sum to less than 100%")
-        #print piece, fract
-        if len(piece) != len(fract):
-            raise ValueError("Missing base component of mixture")
-        return piece, fract
-    def convert_by_weight(string, location, tokens):
-        """convert mixture by wt% or mass%"""
-        piece, fract = _parts_by_weight_vol(tokens)
-        return _mix_by_weight_pairs(zip(piece, fract))
-    mixture_by_weight.set_parse_action(convert_by_weight)
-
-    mixture_by_volume = (number + volume_percent + mixture
-                 + ZeroOrMore(partsep+number+(volume_percent|percent)+mixture)
-                 + Optional(partsep + mixture, default=None))
-    def convert_by_volume(string, location, tokens):
-        """convert mixture by vol%"""
-        piece, fract = _parts_by_weight_vol(tokens)
-        return _mix_by_volume_pairs(zip(piece, fract))
-    mixture_by_volume.set_parse_action(convert_by_volume)
-
-    mixture_by_layer = Forward()
-    layer_thick = Group(number + Regex(LENGTH_RE) + space)
-    layer_part = (layer_thick + mixture) | (opengrp + mixture_by_layer + closegrp + sub_count)
-    mixture_by_layer << layer_part + ZeroOrMore(partsep + layer_part)
-    def convert_by_layer(string, location, tokens):
-        """convert layer thickness '# nm material'"""
-        if len(tokens) < 2:
-            return tokens
-        piece = []
-        fract = []
-        for p1, p2 in zip(tokens[0::2], tokens[1::2]):
-            if isinstance(p1, Formula):
-                f = p1.thickness * float(p2)
-                p = p1
-            else:
-                f = float(p1[0]) * LENGTH_UNITS[p1[1]]
-                p = p2
-            piece.append(p)
-            fract.append(f)
-        total = sum(fract)
-        vfract = [(v/total)*100 for v in fract]
-        result = _mix_by_volume_pairs(zip(piece, vfract))
-        result.thickness = total
-        return result
-    mixture_by_layer.set_parse_action(convert_by_layer)
-
-    mixture_by_absmass = Forward()
-    absmass_mass = Group(number + Regex(MASS_VOLUME_RE) + space)
-    absmass_part = (absmass_mass + mixture) | (opengrp + mixture_by_absmass + closegrp + sub_count)
-    mixture_by_absmass << absmass_part + ZeroOrMore(partsep + absmass_part)
-    def convert_by_absmass(string, location, tokens):
-        """convert mass '# mg material'"""
-        if len(tokens) < 2:
-            return tokens
-        piece = []
-        fract = []
-        for p1, p2 in zip(tokens[0::2], tokens[1::2]):
-            if isinstance(p1, Formula):
-                p = p1
-                f = p1.total_mass * float(p2)
-            else:
-                p = p2
-                value = float(p1[0])
-                if p1[1] in VOLUME_UNITS:
-                    # convert to volume in liters to mass in grams before mixing
-                    if p.density is None:
-                        raise ValueError("Need the mass density of "+str(p))
-                    f = value * VOLUME_UNITS[p1[1]] * 1000.*p.density
-                else:
-                    f = value * MASS_UNITS[p1[1]]
-            piece.append(p)
-            fract.append(f)
-
-        total = sum(fract)
-        mfract = [(m/total)*100 for m in fract]
-        result = _mix_by_weight_pairs(zip(piece, mfract))
-        result.total_mass = total
-        return result
-    mixture_by_absmass.set_parse_action(convert_by_absmass)
-
-    ungrouped_mixture = (mixture_by_weight | mixture_by_volume
-                         | mixture_by_layer | mixture_by_absmass)
-    grouped_mixture = opengrp + ungrouped_mixture + closegrp + Optional(density, default=None)
-    def convert_mixture(string, location, tokens):
-        """convert (mixture) @ density"""
-        formula = tokens[0]
-        if tokens[-1] == 'n':
-            formula.natural_density = tokens[-2]
-        elif tokens[-1] == 'i':
-            formula.density = tokens[-2]
-        # elif tokens[-1] is None
-        return formula
-    grouped_mixture.set_parse_action(convert_mixture)
-
-    mixture << (compound | grouped_mixture)
-    formula = (compound | ungrouped_mixture | grouped_mixture)
-    grammar = Optional(formula, default=Formula()) + StringEnd()
-
-    grammar.set_name('Chemical Formula')
-    return grammar
-
-_PARSER_CACHE: dict[PeriodicTable, "ParserElement"] = {}
-def old_parser(formula_str: str, table: PeriodicTable|None=None) -> Formula:
-    """
-    Parse a chemical formula, returning a structure with elements from the
-    given periodic table.
-    """
-    table = default_table(table)
-    if table not in _PARSER_CACHE:
-        _PARSER_CACHE[table] = formula_grammar(table)
-    parser = _PARSER_CACHE[table]
-    #print(parser)
-    return parser.parse_string(formula_str)[0]
-
 def _count_atoms(seq: Structure) -> dict[Atom, float]:
     """
     Traverse formula structure, counting the total number of atoms.
diff --git a/periodictable/lark_parse.py b/periodictable/lark_parse.py
index 6b2ab83..d70e45d 100644
--- a/periodictable/lark_parse.py
+++ b/periodictable/lark_parse.py
@@ -831,8 +831,6 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
 """
 
 def check():
-    from periodictable.formulas import old_parser
-
     for line in examples.split('\n'):
         formula = line.split('#')[0]
         bad = line.startswith('!')
@@ -844,9 +842,7 @@ def check():
             else:
                 print(f"*** {line}")
             try:
-                # Toggle the following to test pyparsing vs lark
                 tree = parse_formula(formula)
-                #tree = old_parser(formula) if "##" not in line else "!!! pyparsing fails"
                 density = getattr(tree, 'density', None)
                 density_str = f" @ {density:.2f}" if density else ""
                 mode = 'unicode' # unicode latex html plain
diff --git a/pyproject.toml b/pyproject.toml
index c4af2ef..120f656 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,7 @@
     # Matplotlib and uncertainties are optional packages, used for making
     # plots in the docs and generating neutron data tables for the web.
     # mypy checks all code so they are needed for testing as well.
-    optional = ["uncertainties", "matplotlib", "pytparsing>=3.0.0"]
+    optional = ["uncertainties", "matplotlib"]
     docs = ["sphinx", {include-group = "optional"}]
     test = ["pytest", "pytest-cov", "pytest-mypy", {include-group = "optional"}]
     dev = [

From 1ccb4a3e1d975ef5cd0424637eb832f8edfd5ed0 Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Fri, 22 May 2026 11:58:41 -0400
Subject: [PATCH 13/19] tweak formula docs

---
 doc/sphinx/guide/formula_grammar.rst | 74 +++++++++++++++++++---------
 periodictable/lark_parse.py          | 44 +++++++++--------
 2 files changed, 73 insertions(+), 45 deletions(-)

diff --git a/doc/sphinx/guide/formula_grammar.rst b/doc/sphinx/guide/formula_grammar.rst
index da694cd..014b795 100644
--- a/doc/sphinx/guide/formula_grammar.rst
+++ b/doc/sphinx/guide/formula_grammar.rst
@@ -159,9 +159,20 @@ The grammar used for parsing formula strings is the following:
 
 ::
 
+    # formula: composite @ density | str:sequence @ density | mixture
     formula    : compound | mixture
+    compound   : (composite | fasta) [density]
+    # Density applies to the entire composite, such as "NaCl + 29.2H2O @ 1.07n"
+    # For the density of a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n"
+
+    # Activation only cares about total mass, so you can freely mix masses and volumes if
+    # you have the density for each component. For scattering you need the density of the
+    # mixture. When this is different from the mixture of densities use (mixture)@density.
+    # For thin film samples, allow stacking of layers with the thickness of each layer.
+    # With density for each layer the relative quantities of each element in the stack can
+    # be calculated. Convert to mass by multiplying density by thickness (cm) and area (cm²).
 
-    # Mixture definitions:  quantity compound // quantity compound // quantity compound
+    # mixture:  quantity compound // quantity compound // ...
     mixture    : byamount | byvolume | byweight | layers
     byamount   : quantity compound (MIX quantity compound)*
     byvolume   : volumepct compound (MIX percentage compound)* MIX compound
@@ -172,13 +183,20 @@ The grammar used for parsing formula strings is the following:
     volumepct  : NUMBER SPACE? VOLUMEPCT SPACE
     thickness  : NUMBER SPACE? LENGTH SPACE
     percentage : NUMBER SPACE? "%" SPACE  # Allows "3 % "
+    MIX        : SPACE? "//" SPACE?
+    WEIGHTPCT  : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/
+    VOLUMEPCT  : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/
+    MASS       : "kg" | "g" | "mg" | "ug" | "μg" | "ng"
+    VOLUME     : "L" | "mL" | "uL" | "μL" | "nL"
+    LENGTH     : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å"
 
-    # Compound definition: number group ... @density where group is El count El count ...
-    # FASTA sequences: (rna|dna|aa) : SEQUENCE @ density
-    # Density applies to the entire formula, such as "NaCl + 29.2H2O @ 1.07n"
-    # For the density of a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n"
-    compound   : (composite | fasta) [density]
+    # FASTA sequence:   (rna|dna|aa):SEQUENCE @ density
     fasta      : FASTA ":" SEQUENCE
+    FASTA      : /[a-z]+/  # str:sequence reports better errors than /dna|rna|aa/:sequence
+    SEQUENCE   : /[-A-Z *]+/
+
+    # composite: number group number group ... @density
+    # group: El count El count ...
     composite  : [NUMBER] group (SEPARATOR [NUMBER] group)*
     group      : ((atom | isoatom | "(" formula ")") [COUNT])+
     atom       : SYMBOL [isotope] [valence]
@@ -186,23 +204,23 @@ The grammar used for parsing formula strings is the following:
     isotope    : "[" INTEGER "]"
     valence    : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE
     density    : SPACE? "@" SPACE? DENSITY [DENSITYMODE]
-
-    # Tokens
-    #FASTA     : /dna|rna|aa/  # Sequence type is limited to these values but ...
-    FASTA      : /[a-z]+/      # "type:sequence" syntax allows better error reporting
-    SEQUENCE   : /[-A-Z *]+/
     # could list all elements, but better error reporting if element symbol lookup fails
     SYMBOL     : /[A-Z][a-z]*/
     CHARGE     : /[+]+|[-]+/  # allow valence using {++} or {--}
+    SUPERCHARGE: /\u207A+|\u207B+/ # unicode valence such as Ca⁺⁺ and O²⁻
     DENSITY    : NUMBER  # using alias DENSITY for number for better error reporting
     DENSITYMODE: /[ni]/       # n=natural density, i=isotopic density
-    MIX        : SPACE? "//" SPACE?
-    WEIGHTPCT  : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/
-    VOLUMEPCT  : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/
-    MASS       : "kg" | "g" | "mg" | "ug" | "μg" | "ng"
-    VOLUME     : "L" | "mL" | "uL" | "μL" | "nL"
-    LENGTH     : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å"
     COUNT      : NUMBER | SUBNUM  # atom counts can be normal numbers or unicode subscripts
+    SEPARATOR  : SPACE? /[+•·]/ SPACE? | SPACE   # For example, CaCO₃·6H₂O
+
+    SPACE      : /[ \t\n\r]+/
+    NUMBER     : INTEGER | FRACTION
+    INTEGER    : /[1-9][0-9]*/
+    FRACTION   : /([1-9][0-9]*|0)?[.][0-9]*/  # allow all floats?
+    SUBNUM     : SUBINT | SUBFRAC
+    SUBINT     : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*)/
+    SUBFRAC    : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*|)([.][\u2080-\u2089]*)/
+    SUPERINT   : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/
 
 Formulas can also be constructed from atoms or other formulas:
 
@@ -281,18 +299,26 @@ following is a 2:1 mixture of water and heavy water:
     >>> H2O = formula('H2O',natural_density=1)
     >>> D2O = formula('D2O',natural_density=1)
     >>> mix = mix_by_volume(H2O,2,D2O,1)
-    >>> print(f"{mix} {mix.density:.4g}")
-    (H2O)2D2O 1.037
+    >>> print(f"{mix} @ {mix.density:.4g}")
+    (H2O)2D2O @ 1.037
 
-Note that this is different from a 2:1 mixture by weight:
+This is different from a 2:1 mixture by weight:
 
     >>> mix = mix_by_weight(H2O,2,D2O,1)
-    >>> print(f"{mix} {mix.density:.4g}")
-    (H2O)2.22339D2O 1.035
+    >>> print(f"{mix} @ {mix.density:.4g}")
+    (H2O)2.22339D2O @ 1.035
 
 Except in the simplest of cases, the density of the mixture cannot be
-computed from the densities of the components, and the resulting density
-should be set explicitly.
+computed from the densities of the components. Even when the component
+densities are known, the resulting density should be set explicitly:
+
+    >>> mix = mix_by_weight("NaCl@2.17", 0.1, "H2O@1", 0.9)
+    >>> print(f"{mix} @ {mix.density:.4g}")
+    NaCl(H2O)29.1956 @ 1.057
+    >>> mix = mix_by_weight("NaCl@2.17", 0.1, "H2O@1", 0.9, density=1.07)
+    >>> print(f"{mix} @ {mix.density:.4g}")
+    NaCl(H2O)29.1956 @ 1.07
+
 
 Derived values
 --------------
diff --git a/periodictable/lark_parse.py b/periodictable/lark_parse.py
index d70e45d..6a696e8 100644
--- a/periodictable/lark_parse.py
+++ b/periodictable/lark_parse.py
@@ -19,18 +19,24 @@
 VOLUME_UNITS = {'nL': 1e-9, 'uL': 1e-6, 'mL': 1e-3, 'L': 1e+0}
 
 # TODO: use grammar string directly in the sphinx/guide/formula_grammar.rst
+# Any changes to the grammar below should be copied to formula_grammar.rst
 grammar = """
 start      : SPACE? formula SPACE?  # strip blank space from start and end
+
+# formula: composite @ density | str:sequence @ density | mixture
 formula    : compound | mixture
+compound   : (composite | fasta) [density]
+# Density applies to the entire composite, such as "NaCl + 29.2H2O @ 1.07n"
+# For the density of a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n"
 
-# Mixture definitions:  quantity compound // quantity compound // quantity compound
 # Activation only cares about total mass, so you can freely mix masses and volumes if
 # you have the density for each component. For scattering you need the density of the
 # mixture. When this is different from the mixture of densities use (mixture)@density.
 # For thin film samples, allow stacking of layers with the thickness of each layer.
 # With density for each layer the relative quantities of each element in the stack can
-# be calculated. Convert to mass by multiplying by thickness (cm) and area (cm²).
+# be calculated. Convert to mass by multiplying density by thickness (cm) and area (cm²).
 
+# mixture:  quantity compound // quantity compound // ...
 mixture    : byamount | byvolume | byweight | layers
 byamount   : quantity compound (MIX quantity compound)*
 byvolume   : volumepct compound (MIX percentage compound)* MIX compound
@@ -41,14 +47,21 @@
 volumepct  : NUMBER SPACE? VOLUMEPCT SPACE
 thickness  : NUMBER SPACE? LENGTH SPACE
 percentage : NUMBER SPACE? "%" SPACE  # Allows "3 % "
+MIX        : SPACE? "//" SPACE?
+WEIGHTPCT  : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/
+VOLUMEPCT  : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/
+MASS       : "kg" | "g" | "mg" | "ug" | "μg" | "ng"
+VOLUME     : "L" | "mL" | "uL" | "μL" | "nL"
+LENGTH     : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å"
 
-# Composite: number group ... @density where group is El count El count ...
-# Density applies to the entire composite, such as "NaCl + 29.2H2O @ 1.07n"
-# For the density of a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n"
-# FASTA sequences: (rna|dna|aa) : SEQUENCE @ density
-# Note: optional `[token]` leaves a None placeholder in the tree, unlike `token?`
-compound   : (composite | fasta) [density]
+# FASTA sequence:   (rna|dna|aa):SEQUENCE @ density
 fasta      : FASTA ":" SEQUENCE
+FASTA      : /[a-z]+/  # str:sequence reports better errors than /dna|rna|aa/:sequence
+SEQUENCE   : /[-A-Z *]+/
+
+# composite: number group number group ... @density
+# group: El count El count ...
+# Note: optional `[token]` leaves a None placeholder in the tree, unlike `token?`
 composite  : [NUMBER] group (SEPARATOR [NUMBER] group)*
 group      : ((atom | isoatom | "(" formula ")") [COUNT])+
 atom       : SYMBOL [isotope] [valence]
@@ -56,25 +69,15 @@
 isotope    : "[" INTEGER "]"
 valence    : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE
 density    : SPACE? "@" SPACE? DENSITY [DENSITYMODE]
-
-# Tokens
-#FASTA     : /dna|rna|aa/  # Sequence type is limited to these values but ...
-FASTA      : /[a-z]+/      # "str:sequence" syntax allows better error reporting
-SEQUENCE   : /[-A-Z *]+/
 # could list all elements, but better error reporting if element symbol lookup fails
 SYMBOL     : /[A-Z][a-z]*/
 CHARGE     : /[+]+|[-]+/  # allow valence using {++} or {--}
+SUPERCHARGE: /\u207A+|\u207B+/ # unicode valence such as Ca⁺⁺ and O²⁻
 DENSITY    : NUMBER  # using alias DENSITY for number for better error reporting
 DENSITYMODE: /[ni]/       # n=natural density, i=isotopic density
-MIX        : SPACE? "//" SPACE?
-WEIGHTPCT  : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/
-VOLUMEPCT  : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/
-MASS       : "kg" | "g" | "mg" | "ug" | "μg" | "ng"
-VOLUME     : "L" | "mL" | "uL" | "μL" | "nL"
-LENGTH     : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å"
 COUNT      : NUMBER | SUBNUM  # atom counts can be normal numbers or unicode subscripts
+SEPARATOR  : SPACE? /[+•·]/ SPACE? | SPACE   # For example, CaCO₃·6H₂O
 
-SEPARATOR  : SPACE? /[+•·]/ SPACE? | SPACE
 SPACE      : /[ \\t\\n\\r]+/
 NUMBER     : INTEGER | FRACTION
 INTEGER    : /[1-9][0-9]*/
@@ -83,7 +86,6 @@
 SUBINT     : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*)/
 SUBFRAC    : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*|)([.][\u2080-\u2089]*)/
 SUPERINT   : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/
-SUPERCHARGE: /\u207A+|\u207B+/  # Allow Ca++ and Cl- using superscript + and -
 """
 
 # propagate_positions saves start_pos and end_pos for each rule as well as each terminal.

From 40a1b6ceb076206fce54f2650f132ca9c7247d09 Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Fri, 22 May 2026 12:09:55 -0400
Subject: [PATCH 14/19] improve output of error handling demo python -m
 periodictable.lark_parse

---
 periodictable/lark_parse.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/periodictable/lark_parse.py b/periodictable/lark_parse.py
index 6a696e8..0b19aa6 100644
--- a/periodictable/lark_parse.py
+++ b/periodictable/lark_parse.py
@@ -839,6 +839,7 @@ def check():
         if bad:
             formula = formula[1:]
         if formula:
+            print()
             if bad:
                 print(f"!!! {line[1:]}")
             else:

From abe50587849a24cbce44b4230acd2188c80ce44f Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Fri, 22 May 2026 12:22:47 -0400
Subject: [PATCH 15/19] attempt to fix missing lark on CI

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 120f656..9182798 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,6 +37,7 @@
         {include-group = "build"},
         {include-group = "docs"},
         {include-group = "test"},
+        "periodictable",
     ]
 
 [project.urls]

From e2ce8f7b2c968c0d19aa7d82b86ac3ada28e1f1b Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Fri, 22 May 2026 12:24:30 -0400
Subject: [PATCH 16/19] attempt to fix missing lark on CI

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9182798..b40f53e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@
         {include-group = "build"},
         {include-group = "docs"},
         {include-group = "test"},
-        "periodictable",
+        "numpy", "lark",
     ]
 
 [project.urls]

From cc49581efe6eacdbd5f8ec1edd9c29f3b64759dc Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Fri, 22 May 2026 12:26:36 -0400
Subject: [PATCH 17/19] attempt to fix missing lark on CI

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index b40f53e..a327bf0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,6 +37,7 @@
         {include-group = "build"},
         {include-group = "docs"},
         {include-group = "test"},
+        # TODO: Shouldn't have to copy base dependencies here...is there a better way?
         "numpy", "lark",
     ]
 

From e3308e643e7b9d5319fc576ee18317c1c363b277 Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Fri, 29 May 2026 14:02:43 -0400
Subject: [PATCH 18/19] Fix conversion from 8 and 9 to superscript 8 and 9

---
 periodictable/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/periodictable/util.py b/periodictable/util.py
index 0b7267c..5884f1f 100644
--- a/periodictable/util.py
+++ b/periodictable/util.py
@@ -116,7 +116,7 @@ def unicode_superscript(value: str) -> str:
         '1': '\u00B9',
         '0': '\u2070', 'i': '\u2071',
         '4': '\u2074', '5': '\u2075', '6': '\u2076', '7': '\u2077',
-        '9': '\u2078', '0': '\u2079', '+': '\u207a', '-': '\u207b',
+        '8': '\u2078', '9': '\u2079', '+': '\u207a', '-': '\u207b',
         '=': '\u207c', '(': '\u207d', ')': '\u207e', 'n': '\u207f',
 
         '\u2013': '\u207b', # en-dash is same as dash

From 12e1a06ac27f0bd66c43659e94fffe719bbe0e2e Mon Sep 17 00:00:00 2001
From: Paul Kienzle <pkienzle@nist.gov>
Date: Fri, 29 May 2026 15:09:25 -0400
Subject: [PATCH 19/19] group lark parsing examples by what is being tested

---
 periodictable/lark_parse.py | 150 +++++++++++++++++++++---------------
 1 file changed, 86 insertions(+), 64 deletions(-)

diff --git a/periodictable/lark_parse.py b/periodictable/lark_parse.py
index 0b19aa6..f19efd0 100644
--- a/periodictable/lark_parse.py
+++ b/periodictable/lark_parse.py
@@ -749,86 +749,106 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula:
     return tree
 
 # Error conditions are marked with '!' so the exception is ignored
-# Lines marked ## fail on the existing parser
+# Lines marked ## fail on the pyparsing parser
 examples = """
-! DNA:CAGT  # incorrect case for FASTA type not properly identified
-! dna CAGT  # missing colon in FASTA
-! O²  # SUPERCHARGE should be the only valid token here
-! ₃H2O  # badly placed subscript
-! // 3g Ca  # // is not a comment
-! 3g Ca@ // 5g Si # missing density value
-! Ca@i  # missing density value  ##
-! Ca ⁺⁺  # extra space before valence
-! Ca++  # missing braces in valence: the + is acting as SEPARATOR
-! Ca2+  # missing braces in valence: the 2 is acting as COUNT and the + as SEPARATOR
-! Ca{2}  # missing charge in valence
-! 37 vol% H2O@1 / 5% D2O@1  # missing /
-! 37 vol% H2O@1 /// 5% D2O@1  # extra /
-! H2O@1h  # bad density mode
-! 37 vol% NaCl@2.16 // H2O@1 // D2O@1  # percent missing in middle part
-! 37 vol% H2O@1 // 5% D2O@1  # percent not allowed in last part
-! 37 vol% H2O@1 // 5 vol% D2O@1  # only % in subsequent parts
-! 37% H2O@1 // D2O@1  # missing vol% or wt%
-! 37 val% H2O@1 // D2O@1  # bad spelling of vol%
-! Fe[56O2 # bad isotope syntax
-! Co[181]  # bad isotope
-! Ca{2+O2  # bad valence syntax
-! Co{17-}  # bad valence
-! 3..5 mg NaCl
-! 3.5 fm Si # bad units at the start; could be wt%/vol% or LENGTH, VOLUME, MASS 
-! 3.5 mm Si // 2.5 nm SiO2 //
-! 3.5 mm Si // 2.5 nm SiO2 // 35 mm cG
-! ((Co) # mismatched LPAR
-! Co)  # mismatched RPAR
-! bad:CAGT  # bad sequence type
+
+# === Composite tests ===
 Co
-dna:CAGT
-(Co@5) ##
-(((Co@5)@6)) ##
+H2SO4
 CaCO3
 CaCO₃
+(Co@5)       ##
+(((Co@5)@6)) ##
 CaCO3+6H2O
 CaCO3 6H2O
 CaCO3(H2O)6
 CaCO3 (H2O)6
 (Ca(CO3)((H2O)6))
-CaCO₃·6H₂O  ##
+CaCO₃·6H₂O   ##
+! Bl2Oh   # bad symbol
+! (Co     # mismatched LPAR
+! Co)     # mismatched RPAR
+! ((Co)   # mismatched LPAR
+! ₃H2O    # badly placed subscript
+
+# === Isotope tests ===
 DHO
-!Ca{2++}  # bad valence string
-Ca⁺⁺  # also Ca{2+}  ##
-O²⁻   ##
 H[1]
-²H⁺    # D{+} ##
-O²H⁻   # OD{-} ##
-O²⁻H⁺  # O{2-}H{+} ##
-O²⁻²H⁺ # O{2-}D{+} ##
-H2O@1
-D2O@1n
-D2O @ 1.11  ##
-D2O@1.11i
-HO{1-}
+¹⁸O₂
+! Fe[56O2  # bad isotope syntax
+! Co[181]  # bad isotope
+
+# === Valence tests ===
+Ca{2+}
+Ca{++}
+Ca⁺⁺   ##
+O{2-}
+O{--}
+O²⁻    ##
+H{+}
+H{-}
+HO{1-}    # HO- applies to the group, but valence is attached to O
 H[1]{1-}O
-H2SO4
-C3H4H[1]NO@1.29n
+²H⁺       # D{+} ##
+O²H⁻      # no ambiguity since valence requires a trailing + or - ##
+O²⁻H⁺     # O{2-}H{+} ##
+O²⁻²H⁺    # O{2-}D{+} ##
+! Ca{2}   # missing charge in valence
+! Ca{2++} # can't use number++
+! Ca{2+O2 # missing close brace on valence
+! Co{17-} # bad valence value
+! Ca ⁺⁺   # extra space before valence
+! Ca++    # missing braces in valence: the + is acting as SEPARATOR
+! Ca2+    # missing braces in valence: the 2 is acting as COUNT and the + as SEPARATOR
+! O²      # Should be looking for SUPERCHARGE (e.g., O²⁻) or SYMBOL (e.g., O²H)
+
+# === Density tests ===
+H2O@1               # density is 1, where H and O use natural abundance
+H2O @ 1             # spaces allowed around '@' ##
+D2O@1n              # natural density "n" is 1 so isotopic density is 1.11
+D2O@1.11i           # isotopic density is 1.11
+D2O@1.11            # default is "i" for isotopic density
+C3H4H[1]NO@1.29n    # another natural density example
 78.2H2O[16] + 21.8H2O[18] @1n  # density applies to composite
-dna:CAGT @1n  # fasta density override
-50 wt% Co // Ti
-33 wt% Co // 33% Fe // Ti
-! 93 wt% Co // 33% Fe // Ti  # More than 100 wt%
-! 93 vol% Co // 33% Fe // Ti  # More than 100 vol%
+! 3g Ca@ // 5g Si   # missing density value
+! Ca@i              # missing density value  ##
+! H2O@1h            # bad density mode
+
+# === Mixture tests ===
+50 wt% Co // Ti                 # mix by mass; final component does not need a percentage
+33 wt% Co // 33% Fe // Ti       # intermediate components need percentage
+! 93 wt% Co // 33% Fe // Ti     # more than 100 wt%
+! 93 vol% Co // 33% Fe // Ti    # more than 100 vol%
 20 vol% (10 wt% NaCl@2.16 // H2O@1) // D2O@1n
-NaCl(H2O)29.1966(D2O)122.794@1.10i
-5g NaCl // 50mL H2O@1
-5g NaCl@2.16 // 50mL H2O@1
-! 5g NaCl // 50mL H2O   # Need density for H2O to convert volume to mass
-(10 wt% NaCl // H2O)@1.07n # set density of a mixture
+5g NaCl // 50mL H2O@1           # volume components need density to determine mass fraction
+5g NaCl@2.16 // 50mL H2O@1      # need component densities to estimate mixture density
+NaCl(H2O)29.1966(D2O)122.794@1.10i  # mixture rendered as formula
+! 5g NaCl // 50mL H2O           # need density for H2O to convert volume to mass
+(10 wt% NaCl // H2O)@1.07n      # set density of a mixture
 50 mL (45 mL H2O@1 // 5 g NaCl)@1.0707 // 20 mL D2O@1n
 1 cm Si // 5 nm Cr // 10 nm Au
-aa:RELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV
+! 4 nm NaCl@2.17// 50 g Si      # can't use mass in layer mixture
+! 3..5 mg NaCl                  # bad number format
+! 5 Mg NaCl // 50mL H2O@1       # bad units
+! 3.5 fm Si                     # bad units; expecting wt%/vol% or LENGTH, VOLUME, MASS
+! 3.5 mm Si // 2.5 nm SiO2 //   # missing final component of mixture
+! 3.5 mm Si // 2.5 nm SiO2 // 35 mm cG      # bad final component of mixture
+! // 3g Ca                      # // is not a comment
+! 37 vol% H2O@1 / 5% D2O@1      # missing /
+! 37 vol% H2O@1 /// 5% D2O@1    # extra /
+! 37 vol% NaCl@2.16 // H2O@1 // D2O@1  # percent missing in middle part
+! 37 vol% H2O@1 // 5% D2O@1     # percent not allowed in last part
+! 37 vol% H2O@1 // 5 vol% D2O@1 # only % in subsequent parts
+! 37% H2O@1 // D2O@1            # missing vol% or wt%
+! 37 val% H2O@1 // D2O@1        # bad spelling of vol%
 
-! Bl2Oh   # Bad symbol
-! 5 Mg NaCl // 50mL H2O@1  # Bad units
-! 4 nm NaCl@2.17// 50 g Si  # Can't use mass in layer mixture
+# === FASTA tests ===
+dna:CAGT
+dna:CAGT @1n  # can override the density of a FASTA sequence
+aa:RELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV
+! DNA:CAGT    # incorrect case for FASTA type
+! dna CAGT    # missing colon between FASTA type and sequence
+! bad:CAGT    # bad FASTA sequence type
 
 """
 
@@ -838,7 +858,7 @@ def check():
         bad = line.startswith('!')
         if bad:
             formula = formula[1:]
-        if formula:
+        if formula.strip():
             print()
             if bad:
                 print(f"!!! {line[1:]}")
@@ -862,6 +882,8 @@ def check():
                     continue  # pyparsing should fail but doesn't
                 if bad:
                     raise RuntimeError(f"Exception not raised for <{formula}>")
+        else:
+            print(line)
 
 def main():
     import sys