From 1cec0b90e4173e05003e00b0c73e0d59f0f67b25 Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Thu, 5 Mar 2026 15:45:58 -0500 Subject: [PATCH 01/19] Explore lark as replacement for the pyparsing formula parser --- explore/lark_parse.py | 712 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 712 insertions(+) create mode 100644 explore/lark_parse.py diff --git a/explore/lark_parse.py b/explore/lark_parse.py new file mode 100644 index 0000000..aea2724 --- /dev/null +++ b/explore/lark_parse.py @@ -0,0 +1,712 @@ +import lark +import periodictable as pt +from periodictable.core import PeriodicTable +from periodictable.core import default_table +from periodictable.formulas import from_subscript, Formula, _mix_by_weight_pairs, _mix_by_volume_pairs +from periodictable.formulas import VOLUME_UNITS, MASS_UNITS, LENGTH_UNITS + +grammar = """ +start : SPACE? formula SPACE? # strip blank space from start and end +formula : compound | mixture + +# Mixture definitions: quantity compound // quantity compound // quantity compound +# Activation only cares about total mass, so you can freely mix masses and volumes if +# you have the density for each component. Scattering cares about density of the mixture, +# which in general is different from the mixture of densities. +# To convert layers to masses for activation estimates we need density. Also need to scale by +# area to convert density and thickness to mass. Assume unit area is cm^2, so for +# example "4 (5 nm Ni // 2 mm Si)" is a 4 cm^2 wafer of nickel on silicon. If you +# were to add a polymer you would need its density: "4 (20 nm C5H10@1.2 + +mixture : byamount | byvolume | byweight | layers +byamount : quantity compound (MIX quantity compound)* +byvolume : volumepct compound (MIX percentage compound)* MIX compound +byweight : weightpct compound (MIX percentage compound)* MIX compound +layers : thickness compound (MIX thickness compound)* +quantity : NUMBER SPACE? (MASS | VOLUME) SPACE +weightpct : NUMBER SPACE? 
WEIGHTPCT SPACE +volumepct : NUMBER SPACE? VOLUMEPCT SPACE +thickness : NUMBER SPACE? LENGTH SPACE +percentage : NUMBER SPACE? "%" SPACE # Allows "3 % " + +# Compound definition: number group ... @ density where group is El count El count ... +# FASTA sequences: (rna|dna|aa) : SEQUENCE @ density +# Density applies to the entire formula, such as "NaCl + 29.2H2O @ 1.07n" +# If you do this as a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n" +# Note: `[token]` leaves a None placeholder in the tree, unlike `token?` +compound : (composite | fasta) [density] +fasta : FASTA ":" SEQUENCE +FASTA : /dna|rna|aa/ +SEQUENCE : /[A-Z -*]+/ +composite : [NUMBER] group (SEPARATOR [NUMBER] group)* +group : ((atom | "(" formula ")") [COUNT])+ +atom : SYMBOL [isotope] [charge] +# could list all elements, but better error reporting if element symbol lookup fails +SYMBOL : /[A-Z][a-z]*/ +isotope : "[" INTEGER "]" +charge : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE +density : SPACE? "@" SPACE? NUMBER [DENSITYMODE] + +# Tokens +CHARGE : /[+]+|[-]+/ # allow charge using {++} or {--} +SUPERCHARGE: /\u207A+|\u207B+/ # Allow Ca++ and Cl- using superscript + and - +DENSITYMODE: /[ni]/ +MIX : SPACE? "//" SPACE? +# maybe drop "wt%" and "vol%" +WEIGHTPCT : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/ +VOLUMEPCT : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/ +MASS : "kg" | "g" | "mg" | "ug" | "μg" | "ng" +VOLUME : "L" | "mL" | "uL" | "μL" | "nL" +LENGTH : "cm" | "mm" | "um" | "μm" | "nm" + +SEPARATOR : SPACE? /[+•·]/ SPACE? | SPACE +SPACE : /[ \\t\\n\\r]+/ +COUNT : NUMBER | SUBNUM # atom counts can be normal numbers or unicode subscripts +NUMBER : INTEGER | FRACTION +INTEGER : /[1-9][0-9]*/ +FRACTION : /([1-9][0-9]*|0)?[.][0-9]*/ # allow all floats? 
+SUBNUM : SUBINT | SUBFRAC +SUBINT : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*)/ +SUBFRAC : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*|)([.][\u2080-\u2089]*)/ +SUPERINT : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/ +""" + +parser = lark.Lark(grammar) + +def from_superscript(value: str) -> str: + """ + Convert unicode superscript characters to normal characters. This allows us to parse, + for example, Ca²⁺ as Ca{2+}. + """ + codepoints = { + '\u2070': '0', '\u00B9': '1', '\u00B2': '2', '\u00B3': '3', + '\u2074': '4', '\u2075': '5', '\u2076': '6', '\u2077': '7', + '\u2078': '8', '\u2079': '9', '\u207a': '+', '\u207b': '-', + '\u207c': '=', '\u207d': '(', '\u207e': ')', + + '\u2071': 'i', '\u207f': 'n', + } + return ''.join(codepoints.get(char, char) for char in str(value)) + +def int_or_float(s): + f = float(s) + i = int(f) + return i if i == f else f + +class StripJunk(lark.Transformer): + """ + Token stripper visitor class. + + This is done separately from the formula composer so that we can show the cleaned tree + before debugging the conversion. + """ + def SEPARATOR(self, _): + """Strip token for molecular fragment separator (+ or center dot or spaces).""" + return lark.Discard + def MIX(self, _): + """Strip token for mixture separator //.""" + return lark.Discard + def SPACE(self, _): + """Strip token for (usually optional) spaces.""" + return lark.Discard + def WEIGHTPCT(self, _): + """Strip token for wt% mixture indicator.""" + return lark.Discard + def VOLUMEPCT(self, _): + """Strip token for vol% mixture indicator.""" + return lark.Discard + +class ConvertTokens(lark.Transformer): + """ + Syntax tree to formula conversion class. + """ + def __init__(self, text, table=None): + """ + *text* is the original formula string. + + *table* is an optional alternative periodic table. 
+ """ + self._context = text + self._table = default_table(table) + + def VOLUME(self, token: lark.Token) -> tuple[str, str]: + """ + Convert VOLUME terminal ('volume', unit) pair. + + Unit is a volume unit, such as mL or uL for microlitres. + """ + return 'volume', token.value + def MASS(self, token: lark.Token) -> tuple[str, str]: + """ + Convert MASS terminal to ('mass', unit) pair. + + Unit is a mass unit, such as g or mg. + """ + return 'mass', token.value + def LENGTH(self, token: lark.Token) -> tuple[str, str]: + """ + Convert LENGTH terminal to ('length', unit) pair. + + Unit is a length unit, such as cm or nm. + """ + return 'length', token.value + def NUMBER(self, token: lark.Token) -> int|float: + """ + Convert string to float or integer. + + Numbers are used for quantities and percentages in mixtures, and for multiplier + counts to molecule fragments. + """ + return int_or_float(token.value) + def INTEGER(self, token: lark.Token) -> int: + """ + Convert string to float or integer + """ + return int(token.value) + def COUNT(self, token: lark.Token) -> int|float: + """ + Return the count value for a group component. + + Count is specified after the symbol, either as an ASCII number or using subscript digits. + The period separator for fractional counts uses ASCII in both cases (there is no subscript + period charcter available). If the count is fractional return it as a float, otherwise + return it as an integer. + """ + return int_or_float(from_subscript(token.value)) + def SUPERINT(self, token): + """ + Return the integer value of a sequence of superscript digits. + + This is used in the charge rule as part of the valence specification for the atom. + """ + return int(from_superscript(token.value)) + def DENSITYMODE(self, token): + """ + Return the value of the DENSITYMODE token, either "n" or "i". If no mode is specified + then a token value of None will be given to the density rule. 
+ """ + return token.value + def CHARGE(self, token): + """ + Return a sequence of plus and minus characters. By grammar rules they must all have + the same sign. + + This is used in the charge rule as part of the valence specification for the atom. + """ + return token.value + def SUPERCHARGE(self, token): + """ + Convert sequence of superscript plus and minus characters to ASCII plus and minus. + + This is used in the charge rule as part of the valence specification for the atom. + """ + return from_superscript(token.value) + def SYMBOL(self, token): + """ + Look up the element in the periodic table and return it. + + Raise ValueError if the element doesn't exist. + """ + try: + return self._table.symbol(token.value) + except Exception: + raise ValueError(f"Element {token.value} doesn't exist") + def FASTA(self, token): + """ + Return the token value as the fasta sequence type: "dna", "rna" or "aa". + """ + return token.value + def SEQUENCE(self, token): + """ + Return the token value as the fasta sequence string. + """ + return token.value + def fasta(self, tokens): + """ + Return a fasta sequence and its type. + + Transform: [type, sequence] => ('fasta', type, sequence) + """ + stype, sequence = tokens + return 'fasta', stype, sequence + def isotope(self, tokens): + """ + Return the isotope number for the atom. + + Transform: [isotope] => isotope + """ + return tokens[0] + def charge(self, tokens): + """ + Return valence from number and sign. + + Valence is either a number followed by plus or minus, or a sequence of plus + or minus. If the number was specified it will already have been converted + to a value, otherwise use the length of the charge string as the value. + + The valence can be given using superscript or regular ASCII number and sign + symbols. If ASCII then they need to be wrapped in braces such as Ca{2+}. The + token transform handles the conversion from superscript to ASCII characters + and the conversion from string to number. 
+ + Raise ValueError if a number was supplied along with multiple charge symbols. + + Transform: [number|None, 'charge'] => valence + + Example: ['{1+}'] => [1, '+'] = Ca.ion[1] + # Ca{++} => [None, '++'] = Ca.ion[2] + # Ca{3--} => [3, '--'] = Ca.ion[-3] # value has precedence over charge + """ + print("in charge with", tokens) + value, charge = tokens + if value is None: + value = len(charge) + elif value and len(charge) > 1: + self._raise_error(None, f"Using values of {value} for {value}{charge}") + valence = value if charge[0] == '+' else -value + return valence + def atom(self, tokens): + """ + Returns an atom from the periodic table. + + Usually this will use elements from the default table, but if an alternate table is + provided to the ConvertTokens constructor then that will be used to retrieve the element + from the symbol. + + Isotope and charge are optional. By using the rule "SYMBOL [isotope] [charge|supercharge]" + with "[opt]" for the optional components rather "opt?", the missing components appear + as None in the list of tokens. The "supercharge" option allows unicode superscripts to + be used to specify charge rather than curly braces "{charge}". + + Raises an error if the symbol does not exist, does not have that isotope or doesn't + allow that charge. + + Transform: ['symbol', isotope|None, charge|None] => atom + + Example: ['H', 1, 1] => H[1]{+} + + Example: ['Ca', None, 2] => Ca{2+} + """ + #print("atom", tokens) + el, iso, ion = tokens + if iso and ion: + atom = el[iso].ion[ion] + elif iso: + atom = el[iso] + elif ion: + atom = el.ion[ion] + else: + atom = el + #print(f"atom {tokens} => {atom}") + return atom + + def group(self, tokens): + """ + Returns a sequence of (count, item) pairs, where item is an atom or a nested formula. + Missing counts default to 1. + + Transform: [atom|formula, count|None, ...] => ((count, atom|formula), ...) 
+ """ + tokens = [1 if value is None else value for value in tokens] + pairs = tuple((count, item) for item, count in zip(tokens[::2], tokens[1::2])) + return pairs + + def composite(self, tokens): + """ + Returns a sequence of (number, group) pairs. Each group is a sequence of (count, item) + pairs, where item is an atom or a nested formula. Missing numbers default to 1. + + Transform: [number|None, group, ...] => ((number, group), ...) | ((count, atom), ...) + + Example CaCO3 6H2O: None, ((1, Ca), (1, C), (3, O)), 6, ((2, H), (1, O))] + => ((1, ((1, Ca), (1, C), (3, O))), (6, ((2, H), (1, O)))) + + Example CaCO3(H20)6: [[None, ((1, Ca), (1, C), (3, O), (6, formula('H2O')))] + => ((1, Ca), (1, C), (3, O), (6, formula('H2O'))) + """ + # print("in composite", tokens) + numbers = [1 if v is None else v for v in tokens[::2]] + groups = tokens[1::2] + pairs = tuple((number, group) for number, group in zip(numbers, groups)) + return pairs + + def fasta(self, tokens): + """ + Returns the formula corresponding to the FASTA sequence, with the natural + density set. Labile hydrogen use H[1] in the formula. + + The extra level of nesting in the return value is so that the fasta structure + is like a composite with a single group containing a nested formula. 
+ + Transform: [ /aa|dna|rna/, /[A-Z -*]+/ ] => (1, ((1, formula),)) + + Example dna:CAGT: ['dna', 'CAGT'] => (1, ((1, C39H37H[1]10N15O25P4@1.69),)) + """ + # TODO: fasta is ignoring table when parsing + # TODO: avoid circular imports + # TODO: support other biochemicals (carbohydrate residues, lipids) + from periodictable import fasta + + # print("in fasta", tokens) + seq_type, seq = tokens + if seq_type not in fasta.CODE_TABLES: + raise ValueError(f"Invalid fasta sequence type '{seq_type}:'") + seq = fasta.Sequence(name=None, sequence=seq, type=seq_type) + group = ((1, seq.labile_formula),) + composite = ((1, group),) + return composite + + def density(self, tokens): + """ + Returns a density tuple from the @density construct. Density mode 'n' for + natural or 'i' for isotopic defaults to isotopic. That is, D2O@1.11 is the + isotopic density of D2O, not the natural density of H2O with conversion to + the heavier deutrium isotope. + + Transform: [value, mode|None] => ('density', value, mode) + + Example @1.11: [1.11, None] => ('density', 1.11, 'i') + + Example @1.11i: [1.11, 'i'] => ('density', 1.11, 'i') + + Example @1n: [1, 'n'] => ('density', 1, 'n') + """ + value = tokens[0] + mode = 'i' if not tokens[1] else tokens[1] + return 'density', value, mode + + def compound(self, tokens): + """ + Returns the formula for the compound, with optional density set. + + Density is ('density', value, mode) or None, where mode is 'i' for isotopic density + or 'n' for natural density. + + The compound may come from a FASTA spec, such as dna:CAGT or from a composite, such + as CaCO3+6H2O. The composite may include an embedded formula, such as CaCO3(H2O)6. + In any case, the resulting material token will be a sequence of (multiplier, group) + pairs, where each group is a sequence of (count, item) pairs. Each item may be an + atom or a formula. The fasta transform returns a single group with a single item. 
+ As a nested sequence this is ((1, ((1, formula), ...)), ...), with nothing in the + ellipses. + + Transform: [((number, group), ...), ('density', value, mode)|None] => formula + + Example NaCl@2.16i: [(1, ((1, Na), (1, Cl))), ('density', 2.16, 'i')] => NaCl@2.16i + + Example dna:CAGT: [((1, ((1, C39H37H[1]10N15O25P4@1.69n),)),), None] => C39H37H[1]10N15O25P4@1.69n + + Example CaCO3 6H2O: [((1, ((1, Ca), (1, C), (3, O))), (6, ((2, H), (1, O)))), None] => CaCO3(H2O)6 + + Example CaCO3(H20)6: [((1, ((1, Ca), (1, C), (3, O), (6, H2O@None))),), None] => CaCO3(H2O)6 + """ + # print("in compound with", tokens) + components, density_tuple = tokens + if density_tuple is None: + density, density_mode = None, 'i' + else: + _, density, density_mode = density_tuple + + # If a singleton formula with no density override then return it + # That is, [(1, ((1, formula),)), None] => formula + if density is None and len(components) == 1: + number, group = components[0] + if len(group) == 1 and number == 1: + count, item = group[0] + if count == 1 and isinstance(item, Formula): + # print("isolated formula with no density override") + return item + + # Not an isolated formula, so expand formulas within the groups. + # That is, [..., (number, (..., (count, formula), ...)), ...] + # becomes [..., (number, (..., (count, formula.structure), ...)), ...] + def expand_formula(group): + return tuple((count, getattr(item, 'structure', item)) for count, item in group) + components = tuple((number, expand_formula(group)) for number, group in components) + + # If it is a singleton group then use its structure as the formula structure. + if len(components) == 1 and components[0][0] == 1: + structure = components[0][1] + else: + structure = components + + # Build the formula and assign density if available. 
+ # print("compound structure", structure) + formula = Formula(structure=structure) + if density is not None: + if density_mode == 'n': + formula.natural_density = density + else: + formula.density = density + + # print(f"compound = {formula} @ {formula.density}") + return formula + + def weightpct(self, tokens): + """ + Returns the percentage. The value has already be converted to a number. + + Used as the first percentage of a mix by weight mixture. + + Transform: [percent] => percent + + Example for "3 wt%": [3] => 3 + """ + return tokens[0] + + def volumepct(self, tokens): + """ + Returns the percentage. The value has already be converted to a number. + + Used as the first percentage of a mix by volume mixture. + + Transform: [percent] => percent + + Example for "3 vol%": [3] => 3 + """ + return tokens[0] + + def percentage(self, tokens): + """ + Returns the percentage. The value has already be converted to a number. + + Transform: [percent] => percent + + Example for " 3 % ": [3] => 3 + """ + return tokens[0] + + def byweight(self, tokens): + """ + Returns mixture by wt% of the various components in the system. + + Raises ValueError if total exceeds 100%. + + Transform: [percent, formula, ..., percent, formula, formula] => formula + + Example: [76.95, D2O, H2O] => (D2O)3H2O + """ + total = sum(tokens[:-1:2]) + if total > 100: + raise ValueError(f"Total weight {total}% is more than 100%") + pairs = [(compound, percent) for percent, compound in zip(tokens[:-1:2], tokens[1:-1:2])] + pairs.append((tokens[-1], 100-total)) + # return 'byweight', [*pairs, last_pair] + formula = _mix_by_weight_pairs(pairs) + # print(f"byweight => {formula} @ {formula.density}") + return formula + + def byvolume(self, tokens): + """ + Returns mixture by vol% of the various components in the system. Volumes are converted + to mass using density. + + Raises ValueError if the density is missing from a component formula. + Raises ValueError if total exceeds 100%. 
+ + Transform: [percent, formula, ..., percent, formula, formula] => formula + + Example: [75.0, D2O@1n, H2O@1n] => (D2O)3H2O + """ + # print("by volume", tokens) + total = sum(tokens[:-1:2]) + if total > 100: + raise ValueError(f"Total volume {total}% is more than 100%") + pairs = [(compound, percent) for percent, compound in zip(tokens[:-1:2], tokens[1:-1:2])] + pairs.append((tokens[-1], 100-total)) + # print("byvolume pairs", pairs) + # print("byvolume density", [f.density for f, p in pairs]) + #return 'byvolume', pairs + formula = _mix_by_volume_pairs(pairs) + return formula + + def byamount(self, tokens): + """ + Returns mixture by mass of the various components in the system. Volumes are converted + to mass using density. + + Raises ValueError if the density is missing from a component formula. + + Transform: [quantity, formula, ...] => formula + + Example: [('mass', 5.07, 'g'), NaCl@2.16, ('volume', 50, 'mL'), H2O@1n] => NaCl(H2O)32 + """ + # print("byamount", tokens) + def find_value(quantity, formula): + qtype, value, units = quantity + if qtype == 'volume': + if formula.density is None: + raise ValueError(f"Need the mass density of {formula}") + mass = value * VOLUME_UNITS[units] * 1000.0 * formula.density + else: + mass = value * MASS_UNITS[units] + return mass + values = [find_value(q, f) for q, f in zip(tokens[::2], tokens[1::2])] + total = sum(values) + percent = [(m/total)*100 for m in values] + formula = _mix_by_weight_pairs(zip(tokens[1::2], percent)) + formula.total_mass = total + return formula + + def layers(self, tokens): + """ + Returns the mixture by volume of the various layers in the system. + + Raises ValueError if the density is missing from a component formula. + + Sets formula.thickness to the sum of the layer thicknesses. + + Transform: [quantity, formula, ...] 
=> formula + + Example: [('length', 10.006, 'nm'), Ni, ('length', 3, 'mm'), Si] => NiSi164000 + """ + values = [value*LENGTH_UNITS[units] for dim, value, units in tokens[::2]] + total = sum(values) + percent = [(m/total)*100 for m in values] + formula = _mix_by_volume_pairs(zip(tokens[1::2], percent)) + formula.thickness = total + return formula + + def mixture(self, tokens): + """ + Returns the formula representing the mixture, either byweight, byvolume, byamount or layers + + Transform: [formula] => formula + """ + return tokens[0] + + def formula(self, tokens): + """ + Return the formula representing the compound or mixture. + + Transform: [formula] => formula + """ + return tokens[0] + + def thickness(self, tokens): + """ + Returns (dimension, value, unit) with dimension equal 'length' + + Transform: [value, ('length', unit)] => ('length', value, unit) + + Example: [5, ('length', 'nm')] => ('length', 5, 'nm') + """ + value, (dim, units) = tokens + return dim, value, units + + def quantity(self, tokens): + """ + Returns (dimension, value, unit) with dimension equal 'mass' or 'volume' + + Transform: [value, (dimension, unit)] => (dimension, value, unit) + + Example: [5, ('mass', 'g')] => ('mass', 5, 'g') + """ + value, (dim, units) = tokens + return dim, value, units + + def start(self, tokens): + """ + Return the final formula, with the original text attached. + + Sets formula.source to 'parse string' before returning. + + Transform: [formula] => formula + """ + formula = tokens[0] + # TODO: add the source string to the formula class attributes + # Remember the string which was parsed + formula.source = self._context + return formula + + +def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: + """ + Parse a chemical formula, returning a structure with elements from the + given periodic table. 
+ """ + cleanup = StripJunk() + convert = ConvertTokens(formula_str, table=table) + tree = parser(formula_str) + tree = cleanup.transform(tree) + tree = convert.transform(tree) + return tree + +examples = """ +Co +dna:CAGT +(Co@5) +(((Co@5)@6)) +CaCO3 +CaCO₃ +CaCO3+6H2O +CaCO3 6H2O +CaCO3(H2O)6 +CaCO3 (H2O)6 +(Ca(CO3)((H2O)6)) +CaCO₃·6H₂O +DHO +!Ca{2++} # could be interpreted as Ca{2+} +Ca⁺⁺ # also Ca{2+} +O²⁻ +H[1] +H2O@1 +D2O@1n +D2O @ 1.11 +D2O@1.11i +HO{1-} +H[1]{1-}O +H2SO4 +C3H4H[1]NO@1.29n +78.2H2O[16] + 21.8H2O[18] @1n +50 wt% Co // Ti +33 wt% Co // 33% Fe // Ti +! 93 wt% Co // 33% Fe // Ti # More than 100% +20 vol% (10 wt% NaCl@2.16 // H2O@1) // D2O@1n +NaCl(H2O)29.1966(D2O)122.794@1.10i +5g NaCl // 50mL H2O@1 +5g NaCl@2.16 // 50mL H2O@1 +50 mL (45 mL H2O@1 // 5 g NaCl)@1.0707 // 20 mL D2O@1n +1 cm Si // 5 nm Cr // 10 nm Au +aa:RELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV + +# Error conditions. Mark with '!' so the exception is ignored +! Bl2Oh +! 5 Mg NaCl // 50mL H2O@1 +! 4 nm NaCl@2.17// 50 g Si + +""" + +def check(): + cleanup = StripJunk() + def filt(tree): + #return tree + tree = cleanup.transform(tree) + # import pprint; pprint.pprint(tree) + tree = convert.transform(tree) + return tree + + for line in examples.split('\n'): + formula = line.split('#')[0] + bad = formula.startswith('!') + if bad: + formula = formula[1:] + if formula: + print(f"*** {line}") + convert = ConvertTokens(text=formula) + try: + tree = filt(parser.parse(formula)) + #print(f" => {tree.pretty()}") + density = getattr(tree, 'density', None) + density_str = f" @ {density:.2f}" if density else "" + print(f" => {tree}{density_str}") + # TODO: structure not preserved in mixtures + print(f" {getattr(tree, 'structure', None)}") + except Exception as exc: + if bad: + print(f"!!! 
Error: {exc}") + else: + raise + +if __name__ == "__main__": + check() \ No newline at end of file From e2b5aec25e4dbbe9e10c22dfa0e9c03de18f9bf7 Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Thu, 5 Mar 2026 22:21:58 -0500 Subject: [PATCH 02/19] improve error display for lark parser --- explore/lark_parse.py | 180 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 146 insertions(+), 34 deletions(-) diff --git a/explore/lark_parse.py b/explore/lark_parse.py index aea2724..d773dcc 100644 --- a/explore/lark_parse.py +++ b/explore/lark_parse.py @@ -36,7 +36,8 @@ # Note: `[token]` leaves a None placeholder in the tree, unlike `token?` compound : (composite | fasta) [density] fasta : FASTA ":" SEQUENCE -FASTA : /dna|rna|aa/ +FASTA : /[a-z]+/ # Generic "str:sequence" syntax allows better error reporting +#FASTA : /dna|rna|aa/ SEQUENCE : /[A-Z -*]+/ composite : [NUMBER] group (SEPARATOR [NUMBER] group)* group : ((atom | "(" formula ")") [COUNT])+ @@ -45,10 +46,12 @@ SYMBOL : /[A-Z][a-z]*/ isotope : "[" INTEGER "]" charge : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE -density : SPACE? "@" SPACE? NUMBER [DENSITYMODE] +density : SPACE? "@" SPACE? DENSITY [DENSITYMODE] +DENSITY : NUMBER # using alias DENSITY for number for better error reporting # Tokens CHARGE : /[+]+|[-]+/ # allow charge using {++} or {--} +SUPERINT : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/ SUPERCHARGE: /\u207A+|\u207B+/ # Allow Ca++ and Cl- using superscript + and - DENSITYMODE: /[ni]/ MIX : SPACE? "//" SPACE? @@ -57,7 +60,7 @@ VOLUMEPCT : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/ MASS : "kg" | "g" | "mg" | "ug" | "μg" | "ng" VOLUME : "L" | "mL" | "uL" | "μL" | "nL" -LENGTH : "cm" | "mm" | "um" | "μm" | "nm" +LENGTH : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å" SEPARATOR : SPACE? /[+•·]/ SPACE? 
| SPACE SPACE : /[ \\t\\n\\r]+/ @@ -68,10 +71,10 @@ SUBNUM : SUBINT | SUBFRAC SUBINT : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*)/ SUBFRAC : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*|)([.][\u2080-\u2089]*)/ -SUPERINT : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/ """ -parser = lark.Lark(grammar) +# propagate_positions saves start_pos and end_pos for each rule as well as each terminal. +formula_parser = lark.Lark(grammar, propagate_positions=True) def from_superscript(value: str) -> str: """ @@ -99,6 +102,9 @@ class StripJunk(lark.Transformer): This is done separately from the formula composer so that we can show the cleaned tree before debugging the conversion. + + Note: could get the same effect by renaming the unused terminals with leading underscore, + but that makes the grammar harder to read. """ def SEPARATOR(self, _): """Strip token for molecular fragment separator (+ or center dot or spaces).""" @@ -158,6 +164,7 @@ def NUMBER(self, token: lark.Token) -> int|float: counts to molecule fragments. 
""" return int_or_float(token.value) + DENSITY = NUMBER # We've aliased DENSITY and NUMBER in the grammar def INTEGER(self, token: lark.Token) -> int: """ Convert string to float or integer @@ -257,12 +264,12 @@ def charge(self, tokens): # Ca{++} => [None, '++'] = Ca.ion[2] # Ca{3--} => [3, '--'] = Ca.ion[-3] # value has precedence over charge """ - print("in charge with", tokens) + # print("in charge with", tokens) value, charge = tokens if value is None: value = len(charge) elif value and len(charge) > 1: - self._raise_error(None, f"Using values of {value} for {value}{charge}") + raise ValueError(f"Use {value}{charge[0]} instead of {value}{charge} for valence") valence = value if charge[0] == '+' else -value return valence def atom(self, tokens): @@ -486,9 +493,10 @@ def byweight(self, tokens): Example: [76.95, D2O, H2O] => (D2O)3H2O """ + # TODO: structure not preserved in mixtures total = sum(tokens[:-1:2]) if total > 100: - raise ValueError(f"Total weight {total}% is more than 100%") + raise ValueError(f"Total weight {total}% is more than 100% in wt% mixture") pairs = [(compound, percent) for percent, compound in zip(tokens[:-1:2], tokens[1:-1:2])] pairs.append((tokens[-1], 100-total)) # return 'byweight', [*pairs, last_pair] @@ -511,7 +519,7 @@ def byvolume(self, tokens): # print("by volume", tokens) total = sum(tokens[:-1:2]) if total > 100: - raise ValueError(f"Total volume {total}% is more than 100%") + raise ValueError(f"Total volume {total}% is more than 100% in vol% mixture") pairs = [(compound, percent) for percent, compound in zip(tokens[:-1:2], tokens[1:-1:2])] pairs.append((tokens[-1], 100-total)) # print("byvolume pairs", pairs) @@ -560,6 +568,9 @@ def layers(self, tokens): Example: [('length', 10.006, 'nm'), Ni, ('length', 3, 'mm'), Si] => NiSi164000 """ + # # Sanity check: make sure all units are length units. This won't happen + # # because the parser only accepts proper formulas. 
+ # assert all(units in LENGTH_UNITS for dim, value, units in tokens[::2]) values = [value*LENGTH_UNITS[units] for dim, value, units in tokens[::2]] total = sum(values) percent = [(m/total)*100 for m in values] @@ -619,6 +630,61 @@ def start(self, tokens): formula.source = self._context return formula +# TODO: improve error reporting for "allowed" +def _allowed(allowed): + # * SPACE, SEPARATOR: Generally ignored + # * LPAR occurs whereever a symbol could be expected, so skip it + # * COLON: If asking then it probably thinks it is looking for a fasta sequence, but + # instead it should be looking for an element, so replace COLON with SYMBOL. + # * AT: Looking for @DENSITY + # * LPAR, RPAR: "(" and ")" are more readable + # * LSQB: end of element, looking for isotope, so skip + # * LBRACE, SUPERINT, SUPERCHARGE: end of element, looking for valence, so skip + skip = set("SPACE SEPARATOR LPAR LSQB LBRACE SUPERINT SUPERCHARGE".split()) + # TODO: use order of elements in subst to sort the allowed list (currently alphabetical) + subst = dict( + NUMBER="NUMBER", # start of compound or start of mixture + #FASTA="[dna|rna|aa]:SEQ", + FASTA="aa:SEQ", + COLON="aa:SEQ", + SEQUENCE="aa:SEQ", + SEPARATOR="+", # generic group separator in composite + SPACE="SPACE", + SYMBOL="SYMBOL", + CHARGE="CHARGE[+-]", + LPAR='(', + RPAR=')', + LSQB='[', + RSQB=']', + LBRACE='{', # equivalent to SUPERINT and SUPERCHARGE + RBRACE='}', + VOLUMEPCT="vol%", + WEIGHTPCT="wt%", + MASS="UNIT[mg]", + VOLUME="UNIT[mL]", + LENGTH="UNIT[mm]", + PERCENT="%", + # I don't think all three of these can be concurrently allowed so no need to + # deduplicate. Moot since the set operation happens again after substition below. 
+ AT="@DENSITY[ni]", # only the @ is expected, but better for doc + DENSITY="@DENSITY[ni]", # only the number is expected, but better for doc + DENSITYMODE="@DENSITY[ni]", # only the [ni] is expected, but better for doc + MIX="//", + # SUBNUM SUBINT SUBFRAC covered by COUNT + # INTEGER and FRACTION covered by NUMBER + # SUPERINT SUPERCHARGE LSQB LBRACE coexist with COUNT so stripped + SUPERCHARGE="SUPERSCRIPT[+-]", # If you see a superscript number then you need a sign + ) + stripped = set(s for s in allowed if s not in skip) + if not stripped: + stripped = allowed + # Perform substitution for document strings + stripped = set(subst.get(s, s) for s in stripped) + if len(stripped) > 1: + message = f"one of {' '.join(sorted(stripped))}" + else: + message = [*stripped][0] + return message def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: """ @@ -627,12 +693,62 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: """ cleanup = StripJunk() convert = ConvertTokens(formula_str, table=table) - tree = parser(formula_str) + try: + tree = formula_parser.parse(formula_str) + except lark.exceptions.UnexpectedCharacters as exc: + # import pprint; pprint.pprint(exc.__dict__) + context = exc.get_context(formula_str).rstrip() + #context = exc._context.rstrip() + message = f"Expected {_allowed(exc.allowed)} in\n{context}" + raise ValueError(message) + except lark.exceptions.UnexpectedEOF as exc: + # import pprint; pprint.pprint(exc.__dict__) + context = exc.get_context(formula_str).rstrip() + message = f"Expected {_allowed(exc.expected)} in\n{context}" + raise ValueError(message) + except Exception as exc: + # TODO: are other exceptions possible from the Earley parser? 
+ raise exc from None tree = cleanup.transform(tree) - tree = convert.transform(tree) + try: + tree = convert.transform(tree) + except lark.exceptions.VisitError as exc: + # Unwind the VistorError exception capture and reraise the original exception + # This requires that error messages in the transformer give enough context to + # correct the error. + raise exc.orig_exc from None return tree +# Error conditions are marked with '!' so the exception is ignored examples = """ +! DNA:CAGT # incorrect case for FASTA type not properly identified +! dna CAGT # missing colon in FASTA +! O² # SUPERCHARGE should be the only valid token here +! ₃H2O # badly placed subscript +! // 3g Ca # // is not a comment +! 3g Ca@ // 5g Si # missing density value +! Ca@i # missing density value +! Ca ⁺⁺ # extra space before valence +! Ca++ # missing braces in valence +! Ca{2} # missing charge in valence +! 37 vol% H2O@1 / 5% D2O@1 # missing / +! 37 vol% H2O@1 /// 5% D2O@1 # extra / +! H2O@1h # bad density mode +! 37 vol% H2O@1 // 5% D2O@1 # no percent in last part +! 37 vol% H2O@1 // 5 vol% D2O@1 # only % in subsequent parts +! 37% H2O@1 // D2O@1 # missing vol% or wt% +! 37 val% H2O@1 // D2O@1 # bad spelling of vol% +! Fe[56O2 # bad isotope syntax +! Co[181] # bad isotope +! Ca{2+O2 # bad valence syntax +! Co{17-} # bad valence +! 3..5 mg NaCl +! 3.5 fm Si # bad units at the start; could be wt%/vol% or LENGTH, VOLUME, MASS +! 3.5 mm Si // 2.5 nm SiO2 // +! 3.5 mm Si // 2.5 nm SiO2 // 35 mm cG +! ((Co) # mismatched LPAR +! Co) # mismatched RPAR +! 
bad:CAGT # bad sequence type Co dna:CAGT (Co@5) @@ -646,7 +762,7 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: (Ca(CO3)((H2O)6)) CaCO₃·6H₂O DHO -!Ca{2++} # could be interpreted as Ca{2+} +!Ca{2++} # bad valence string Ca⁺⁺ # also Ca{2+} O²⁻ H[1] @@ -661,52 +777,48 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: 78.2H2O[16] + 21.8H2O[18] @1n 50 wt% Co // Ti 33 wt% Co // 33% Fe // Ti -! 93 wt% Co // 33% Fe // Ti # More than 100% +! 93 wt% Co // 33% Fe // Ti # More than 100 wt% +! 93 vol% Co // 33% Fe // Ti # More than 100 vol% 20 vol% (10 wt% NaCl@2.16 // H2O@1) // D2O@1n NaCl(H2O)29.1966(D2O)122.794@1.10i 5g NaCl // 50mL H2O@1 5g NaCl@2.16 // 50mL H2O@1 +! 5g NaCl // 50mL H2O # Need density for H2O to convert volume to mass 50 mL (45 mL H2O@1 // 5 g NaCl)@1.0707 // 20 mL D2O@1n 1 cm Si // 5 nm Cr // 10 nm Au aa:RELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV -# Error conditions. Mark with '!' so the exception is ignored -! Bl2Oh -! 5 Mg NaCl // 50mL H2O@1 -! 4 nm NaCl@2.17// 50 g Si +! Bl2Oh # Bad symbol +! 5 Mg NaCl // 50mL H2O@1 # Bad units +! 4 nm NaCl@2.17// 50 g Si # Can't use mass in layer mixture """ def check(): - cleanup = StripJunk() - def filt(tree): - #return tree - tree = cleanup.transform(tree) - # import pprint; pprint.pprint(tree) - tree = convert.transform(tree) - return tree - for line in examples.split('\n'): formula = line.split('#')[0] - bad = formula.startswith('!') + bad = line.startswith('!') if bad: formula = formula[1:] if formula: - print(f"*** {line}") - convert = ConvertTokens(text=formula) + if bad: + print(f"!!! 
{line[1:]}") + else: + print(f"*** {line}") try: - tree = filt(parser.parse(formula)) - #print(f" => {tree.pretty()}") + tree = parse_formula(formula) density = getattr(tree, 'density', None) density_str = f" @ {density:.2f}" if density else "" print(f" => {tree}{density_str}") - # TODO: structure not preserved in mixtures - print(f" {getattr(tree, 'structure', None)}") + # print(f" {getattr(tree, 'structure', None)}") except Exception as exc: if bad: - print(f"!!! Error: {exc}") + print(f"{exc}") else: - raise + raise exc from None + else: + if bad: + raise RuntimeError(f"Exception not raised for <{formula}>") if __name__ == "__main__": check() \ No newline at end of file From c3c0ced7ea5c29ed1428a0e093db4744b64b3226 Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Thu, 5 Mar 2026 22:22:31 -0500 Subject: [PATCH 03/19] better formatting of invalid valence error --- periodictable/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/periodictable/core.py b/periodictable/core.py index 534a258..9c8e496 100644 --- a/periodictable/core.py +++ b/periodictable/core.py @@ -398,9 +398,9 @@ def __init__(self, element_or_isotope: Element|Isotope): def __getitem__(self, charge: int) -> Ion: if charge not in self.ionset: if charge not in self.element_or_isotope.ions: - raise ValueError("%(charge)d is not a valid charge for %(symbol)s" - % dict(charge=charge, - symbol=self.element_or_isotope.symbol)) + valence = f"{abs(charge)}{'+' if charge > 0 else '-'}" + symbol = self.element_or_isotope.symbol + raise ValueError(f"valence {valence} is not valid for {symbol}") self.ionset[charge] = Ion(self.element_or_isotope, charge) return self.ionset[charge] From a31ce4649381197995d6df382dbabf0c11add9c1 Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Thu, 5 Mar 2026 22:23:19 -0500 Subject: [PATCH 04/19] Allow Ang when defining layered samples --- periodictable/formulas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/periodictable/formulas.py b/periodictable/formulas.py index f731c93..adfe916 100644 --- a/periodictable/formulas.py +++ b/periodictable/formulas.py @@ -712,7 +712,7 @@ def _isotope_substitution(compound: "Formula", source: Atom, target: Atom, porti # TODO: Grammar should be independent of table # TODO: Parser can't handle meters as 'm' because it conflicts with the milli prefix -LENGTH_UNITS = {'nm': 1e-9, 'um': 1e-6, 'μm': 1e-6, 'mm': 1e-3, 'cm': 1e-2} +LENGTH_UNITS = {'nm': 1e-9, 'um': 1e-6, 'μm': 1e-6, 'mm': 1e-3, 'cm': 1e-2, 'Ang': 1e-10, 'Å': 1e-10} MASS_UNITS = {'ng': 1e-9, 'ug': 1e-6, 'mg': 1e-3, 'g': 1e+0, 'kg': 1e+3} VOLUME_UNITS = {'nL': 1e-9, 'uL': 1e-6, 'mL': 1e-3, 'L': 1e+0} LENGTH_RE = '('+'|'.join(LENGTH_UNITS.keys())+')' From 13901325e8cef108ea189a23648ed6eb2a827304 Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Thu, 5 Mar 2026 23:50:38 -0500 Subject: [PATCH 05/19] check failure text in pyparsing --- explore/lark_parse.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/explore/lark_parse.py b/explore/lark_parse.py index d773dcc..cb226bd 100644 --- a/explore/lark_parse.py +++ b/explore/lark_parse.py @@ -720,6 +720,7 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: return tree # Error conditions are marked with '!' so the exception is ignored +# Lines marked ## fail on the existing parser examples = """ ! DNA:CAGT # incorrect case for FASTA type not properly identified ! dna CAGT # missing colon in FASTA @@ -727,7 +728,7 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: ! ₃H2O # badly placed subscript ! // 3g Ca # // is not a comment ! 3g Ca@ // 5g Si # missing density value -! Ca@i # missing density value +! Ca@i # missing density value ## ! Ca ⁺⁺ # extra space before valence ! Ca++ # missing braces in valence ! 
Ca{2} # missing charge in valence @@ -751,8 +752,8 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: ! bad:CAGT # bad sequence type Co dna:CAGT -(Co@5) -(((Co@5)@6)) +(Co@5) ## +(((Co@5)@6)) ## CaCO3 CaCO₃ CaCO3+6H2O @@ -760,15 +761,15 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: CaCO3(H2O)6 CaCO3 (H2O)6 (Ca(CO3)((H2O)6)) -CaCO₃·6H₂O +CaCO₃·6H₂O ## DHO !Ca{2++} # bad valence string -Ca⁺⁺ # also Ca{2+} -O²⁻ +Ca⁺⁺ # also Ca{2+} ## +O²⁻ ## H[1] H2O@1 D2O@1n -D2O @ 1.11 +D2O @ 1.11 ## D2O@1.11i HO{1-} H[1]{1-}O @@ -806,7 +807,9 @@ def check(): else: print(f"*** {line}") try: + # Toggle the following to test pyparsing vs lark tree = parse_formula(formula) + #tree = pt.formula(formula) if "##" not in line else "!!! pyparsing fails" density = getattr(tree, 'density', None) density_str = f" @ {density:.2f}" if density else "" print(f" => {tree}{density_str}") @@ -817,6 +820,7 @@ def check(): else: raise exc from None else: + if '##' in line: continue # pyparsing should fail but doesn't if bad: raise RuntimeError(f"Exception not raised for <{formula}>") From ee2fc986e6e07a0fb613c997571746b2cd51d4ce Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Fri, 6 Mar 2026 14:19:00 -0500 Subject: [PATCH 06/19] display and parse unicode superscripts for isotopes --- explore/lark_parse.py | 112 +++++++++++++++++++------------ periodictable/formulas.py | 134 ++++++++++++++++++++++++++------------ 2 files changed, 162 insertions(+), 84 deletions(-) diff --git a/explore/lark_parse.py b/explore/lark_parse.py index cb226bd..40ae090 100644 --- a/explore/lark_parse.py +++ b/explore/lark_parse.py @@ -2,8 +2,15 @@ import periodictable as pt from periodictable.core import PeriodicTable from periodictable.core import default_table -from periodictable.formulas import from_subscript, Formula, _mix_by_weight_pairs, _mix_by_volume_pairs -from periodictable.formulas import VOLUME_UNITS, MASS_UNITS, LENGTH_UNITS +from 
periodictable.formulas import ( + from_subscript, from_superscript, + Formula, + _mix_by_weight_pairs, _mix_by_volume_pairs, + VOLUME_UNITS, MASS_UNITS, LENGTH_UNITS, + pretty as pretty_formula +) + +# TODO: valence belongs to a group rather than element grammar = """ start : SPACE? formula SPACE? # strip blank space from start and end @@ -40,17 +47,18 @@ #FASTA : /dna|rna|aa/ SEQUENCE : /[A-Z -*]+/ composite : [NUMBER] group (SEPARATOR [NUMBER] group)* -group : ((atom | "(" formula ")") [COUNT])+ -atom : SYMBOL [isotope] [charge] +group : ((atom | isoatom | "(" formula ")") [COUNT])+ +atom : SYMBOL [isotope] [valence] +isoatom : SUPERINT SYMBOL [valence] # could list all elements, but better error reporting if element symbol lookup fails SYMBOL : /[A-Z][a-z]*/ isotope : "[" INTEGER "]" -charge : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE +valence : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE density : SPACE? "@" SPACE? DENSITY [DENSITYMODE] DENSITY : NUMBER # using alias DENSITY for number for better error reporting # Tokens -CHARGE : /[+]+|[-]+/ # allow charge using {++} or {--} +CHARGE : /[+]+|[-]+/ # allow valence using {++} or {--} SUPERINT : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/ SUPERCHARGE: /\u207A+|\u207B+/ # Allow Ca++ and Cl- using superscript + and - DENSITYMODE: /[ni]/ @@ -76,21 +84,6 @@ # propagate_positions saves start_pos and end_pos for each rule as well as each terminal. formula_parser = lark.Lark(grammar, propagate_positions=True) -def from_superscript(value: str) -> str: - """ - Convert unicode superscript characters to normal characters. This allows us to parse, - for example, Ca²⁺ as Ca{2+}. 
- """ - codepoints = { - '\u2070': '0', '\u00B9': '1', '\u00B2': '2', '\u00B3': '3', - '\u2074': '4', '\u2075': '5', '\u2076': '6', '\u2077': '7', - '\u2078': '8', '\u2079': '9', '\u207a': '+', '\u207b': '-', - '\u207c': '=', '\u207d': '(', '\u207e': ')', - - '\u2071': 'i', '\u207f': 'n', - } - return ''.join(codepoints.get(char, char) for char in str(value)) - def int_or_float(s): f = float(s) i = int(f) @@ -184,7 +177,7 @@ def SUPERINT(self, token): """ Return the integer value of a sequence of superscript digits. - This is used in the charge rule as part of the valence specification for the atom. + This is used to specify the valence or to specify the isotope. """ return int(from_superscript(token.value)) def DENSITYMODE(self, token): @@ -198,14 +191,14 @@ def CHARGE(self, token): Return a sequence of plus and minus characters. By grammar rules they must all have the same sign. - This is used in the charge rule as part of the valence specification for the atom. + This is used in the valence rule to specify the charge for the atom. """ return token.value def SUPERCHARGE(self, token): """ Convert sequence of superscript plus and minus characters to ASCII plus and minus. - This is used in the charge rule as part of the valence specification for the atom. + This is used in the valence rule to specify the charge for the atom. """ return from_superscript(token.value) def SYMBOL(self, token): @@ -243,7 +236,7 @@ def isotope(self, tokens): Transform: [isotope] => isotope """ return tokens[0] - def charge(self, tokens): + def valence(self, tokens): """ Return valence from number and sign. 
@@ -261,10 +254,12 @@ def charge(self, tokens): Transform: [number|None, 'charge'] => valence Example: ['{1+}'] => [1, '+'] = Ca.ion[1] - # Ca{++} => [None, '++'] = Ca.ion[2] - # Ca{3--} => [3, '--'] = Ca.ion[-3] # value has precedence over charge + + Example: Ca{++} => [None, '++'] = Ca.ion[2] + + Example: Ca{3--} => ValueError """ - # print("in charge with", tokens) + # print("in valence with", tokens) value, charge = tokens if value is None: value = len(charge) @@ -280,15 +275,10 @@ def atom(self, tokens): provided to the ConvertTokens constructor then that will be used to retrieve the element from the symbol. - Isotope and charge are optional. By using the rule "SYMBOL [isotope] [charge|supercharge]" - with "[opt]" for the optional components rather "opt?", the missing components appear - as None in the list of tokens. The "supercharge" option allows unicode superscripts to - be used to specify charge rather than curly braces "{charge}". - Raises an error if the symbol does not exist, does not have that isotope or doesn't - allow that charge. + allow that valence. - Transform: ['symbol', isotope|None, charge|None] => atom + Transform: ['symbol', isotope|None, valence|None] => atom Example: ['H', 1, 1] => H[1]{+} @@ -307,6 +297,28 @@ def atom(self, tokens): #print(f"atom {tokens} => {atom}") return atom + def isoatom(self, tokens): + """ + Returns an isotope from the periodic table. + + Usually this will use elements from the default table, but if an alternate table is + provided to the ConvertTokens constructor then that will be used to retrieve the element + from the symbol. + + Raises an error if the symbol does not exist, does not have that isotope or doesn't + allow that valence. 
+ + Transform: [isotope, 'symbol', valence|None] => atom + + Example ²H⁺: [2, 'H', 1] => D{+} + """ + # print("isoatom", tokens) + iso, el, ion = tokens + atom = el[iso].ion[ion] if ion else el[iso] + # print(f"isoatom {tokens} => {atom}") + return atom + + def group(self, tokens): """ Returns a sequence of (count, item) pairs, where item is an atom or a nested formula. @@ -630,7 +642,7 @@ def start(self, tokens): formula.source = self._context return formula -# TODO: improve error reporting for "allowed" +# TODO: if the next character is ":" then report error as bad fasta sequence type def _allowed(allowed): # * SPACE, SEPARATOR: Generally ignored # * LPAR occurs whereever a symbol could be expected, so skip it @@ -646,7 +658,8 @@ def _allowed(allowed): NUMBER="NUMBER", # start of compound or start of mixture #FASTA="[dna|rna|aa]:SEQ", FASTA="aa:SEQ", - COLON="aa:SEQ", + COLON=":", + #COLON="aa:SEQ", SEQUENCE="aa:SEQ", SEPARATOR="+", # generic group separator in composite SPACE="SPACE", @@ -682,8 +695,12 @@ def _allowed(allowed): stripped = set(subst.get(s, s) for s in stripped) if len(stripped) > 1: message = f"one of {' '.join(sorted(stripped))}" - else: + elif stripped: message = [*stripped][0] + else: + # This occurs when the middle part of percent mixtures have no percentage. + # We could look for '//' in the string to report a better error message. 
+ message = "end of formula" return message def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: @@ -696,7 +713,7 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: try: tree = formula_parser.parse(formula_str) except lark.exceptions.UnexpectedCharacters as exc: - # import pprint; pprint.pprint(exc.__dict__) + #import pprint; pprint.pprint(exc.__dict__) context = exc.get_context(formula_str).rstrip() #context = exc._context.rstrip() message = f"Expected {_allowed(exc.allowed)} in\n{context}" @@ -730,12 +747,14 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: ! 3g Ca@ // 5g Si # missing density value ! Ca@i # missing density value ## ! Ca ⁺⁺ # extra space before valence -! Ca++ # missing braces in valence +! Ca++ # missing braces in valence: the + is acting as SEPARATOR +! Ca2+ # missing braces in valence: the 2 is acting as COUNT and the + as SEPARATOR ! Ca{2} # missing charge in valence ! 37 vol% H2O@1 / 5% D2O@1 # missing / ! 37 vol% H2O@1 /// 5% D2O@1 # extra / ! H2O@1h # bad density mode -! 37 vol% H2O@1 // 5% D2O@1 # no percent in last part +! 37 vol% NaCl@2.16 // H2O@1 // D2O@1 # percent missing in middle part +! 37 vol% H2O@1 // 5% D2O@1 # percent not allowed in last part ! 37 vol% H2O@1 // 5 vol% D2O@1 # only % in subsequent parts ! 37% H2O@1 // D2O@1 # missing vol% or wt% ! 37 val% H2O@1 // D2O@1 # bad spelling of vol% @@ -767,6 +786,10 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: Ca⁺⁺ # also Ca{2+} ## O²⁻ ## H[1] +²H⁺ # D{+} ## +O²H⁻ # OD{-} ## +O²⁻H⁺ # O{2-}H{+} ## +O²⁻²H⁺ # O{2-}D{+} ## H2O@1 D2O@1n D2O @ 1.11 ## @@ -775,7 +798,8 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: H[1]{1-}O H2SO4 C3H4H[1]NO@1.29n -78.2H2O[16] + 21.8H2O[18] @1n +78.2H2O[16] + 21.8H2O[18] @1n # density applies to composite +dna:CAGT @1n # fasta density override 50 wt% Co // Ti 33 wt% Co // 33% Fe // Ti ! 
93 wt% Co // 33% Fe // Ti # More than 100 wt% @@ -785,6 +809,7 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: 5g NaCl // 50mL H2O@1 5g NaCl@2.16 // 50mL H2O@1 ! 5g NaCl // 50mL H2O # Need density for H2O to convert volume to mass +(10 wt% NaCl // H2O)@1.07n # set density of a mixture 50 mL (45 mL H2O@1 // 5 g NaCl)@1.0707 // 20 mL D2O@1n 1 cm Si // 5 nm Cr // 10 nm Au aa:RELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV @@ -812,7 +837,8 @@ def check(): #tree = pt.formula(formula) if "##" not in line else "!!! pyparsing fails" density = getattr(tree, 'density', None) density_str = f" @ {density:.2f}" if density else "" - print(f" => {tree}{density_str}") + mode = 'unicode' # unicode latex html plain + print(f" => {pretty_formula(tree, mode)}{density_str}") # print(f" {getattr(tree, 'structure', None)}") except Exception as exc: if bad: diff --git a/periodictable/formulas.py b/periodictable/formulas.py index adfe916..9e697ff 100644 --- a/periodictable/formulas.py +++ b/periodictable/formulas.py @@ -1073,19 +1073,6 @@ def _convert_to_hill_notation(atoms: dict[Atom, float]) -> Structure: """ return tuple((atoms[el], el) for el in sorted(atoms.keys(), key=_hill_key)) -def _str_one_atom(fragment: Atom) -> str: - # Normal isotope string form is #-Yy, but we want Yy[#] - if isisotope(fragment) and 'symbol' not in fragment.__dict__: - ret = "%s[%d]"%(fragment.symbol, cast(Isotope, fragment).isotope) - else: - ret = fragment.symbol - if fragment.charge != 0: - sign = '+' if fragment.charge > 0 else '-' - value = str(abs(fragment.charge)) if abs(fragment.charge) > 1 else '' - ret += '{'+value+sign+'}' - return ret - -# TODO: add typing to _str_atoms def _str_atoms(seq) -> str: """ Convert formula structure to string. 
@@ -1094,7 +1081,7 @@ def _str_atoms(seq) -> str: ret = "" for count, fragment in seq: if isatom(fragment): - ret += _str_one_atom(fragment) + ret += str(fragment) if count != 1: ret += "%g"%count else: @@ -1113,7 +1100,7 @@ def from_subscript(value: str) -> str: Convert unicode subscript characters to normal characters. This allows us to parse, for example, H₂O as H2O. """ - subscript_codepoints = { + codepoints = { '\u2080': '0', '\u2081': '1', '\u2082': '2', '\u2083': '3', '\u2084': '4', '\u2085': '5', '\u2086': '6', '\u2087': '7', '\u2088': '8', '\u2089': '9', '\u208a': '+', '\u208b': '-', @@ -1124,11 +1111,26 @@ def from_subscript(value: str) -> str: '\u2098': 'm', '\u2099': 'n', '\u209a': 'p', '\u209b': 's', '\u209c': 't', } - return ''.join(subscript_codepoints.get(char, char) for char in str(value)) + return ''.join(codepoints.get(char, char) for char in str(value)) + +def from_superscript(value: str) -> str: + """ + Convert unicode superscript characters to normal characters. This allows us to parse, + for example, Ca²⁺ as Ca{2+}. + """ + codepoints = { + '\u2070': '0', '\u00B9': '1', '\u00B2': '2', '\u00B3': '3', + '\u2074': '4', '\u2075': '5', '\u2076': '6', '\u2077': '7', + '\u2078': '8', '\u2079': '9', '\u207a': '+', '\u207b': '-', + '\u207c': '=', '\u207d': '(', '\u207e': ')', + + '\u2071': 'i', '\u207f': 'n', + } + return ''.join(codepoints.get(char, char) for char in str(value)) def unicode_subscript(value: str) -> str: # Unicode subscript codepoints. 
Note that decimal point looks okay as subscript - subscript_codepoints = { + codepoints = { '0': '\u2080', '1': '\u2081', '2': '\u2082', '3': '\u2083', '4': '\u2084', '5': '\u2085', '6': '\u2086', '7': '\u2087', '8': '\u2088', '9': '\u2089', '+': '\u208a', '-': '\u208b', @@ -1142,11 +1144,11 @@ def unicode_subscript(value: str) -> str: '\u2013': '\u208b', # en-dash is same as dash '\u2014': '\u208b', # em-dash is same as dash } - return ''.join(subscript_codepoints.get(char, char) for char in str(value)) + return ''.join(codepoints.get(char, char) for char in str(value)) def unicode_superscript(value: str) -> str: # Unicode subscript codepoints. Note that decimal point looks okay as subscript - superscript_codepoints = { + codepoints = { #'.': '\u00B0', # degree symbol looks too much like zero #'.': ' \u02D9', # dot above modifier looks okay in a floating string, but risky #'.': ' \u0307', # space with dot above? @@ -1162,7 +1164,7 @@ def unicode_superscript(value: str) -> str: '\u2013': '\u207b', # en-dash is same as dash '\u2014': '\u207b', # em-dash is same as dash } - return ''.join(superscript_codepoints.get(char, char) for char in str(value)) + return ''.join(codepoints.get(char, char) for char in str(value)) SUBSCRIPT: dict[str, Callable[[str], str]] = { # The latex renderer should work for github style markdown @@ -1171,32 +1173,82 @@ def unicode_superscript(value: str) -> str: 'unicode': unicode_subscript, 'plain': lambda text: text } -def pretty(compound: Formula, mode: str='unicode') -> str: +SUPERSCRIPT: dict[str, Callable[[str], str]] = { + # The latex renderer should work for github style markdown + 'latex': lambda text: f'$^{{{text}}}$', + 'html': lambda text: f'{text}', + 'unicode': unicode_superscript, + 'plain': lambda text: text, +} + +class PrettyFormula: """ - Convert the formula to a string. The *mode* can be 'unicode', 'html' or - 'latex' depending on how subscripts should be rendered. 
If *mode* is 'plain' - then don't use subscripts for the element quantities. + Formula pretty-printer. - Use *pretty(compound.hill)* for a more compact representation. + Formats formuls for output, using superscripts for isotope and valence and + subscripts for element counts. + + *mode* is unicode, latex, html or plain for no special formatting. """ - return _pretty(compound.structure, SUBSCRIPT[mode]) - -# TODO: type hinting for _pretty -def _pretty(structure, subscript: Callable[[str], str]) -> str: - # TODO: if superscript is not None then render O[16] as {}^{16}O - parts = [] - for count, part in structure: - if isinstance(part, tuple): - if count == 1: - parts.append(_pretty(part, subscript)) - else: - parts.append(f'({_pretty(part, subscript)}){subscript(count)}') - elif count == 1: - parts.append(f'{_str_one_atom(part)}') + mode: str + superscript: Callable[[str], str] + subscript: Callable[[str], str] + + def __init__(self, mode): + self.mode = mode + self.subscript = SUBSCRIPT[mode] + self.superscript = SUPERSCRIPT[mode] + + def walk_atom(self, atom): + if self.mode == 'plain': + return str(atom) + if ision(atom): + charge = '-' if atom.charge < 0 else '+' + magnitude = abs(atom.charge) + valence = charge*magnitude if magnitude < 2 else f"{magnitude}{charge}" + valence = self.superscript(valence) + atom = atom.element + else: + valence = "" + if isisotope(atom) and atom.symbol == atom.element.symbol: + isotope = self.superscript(str(atom.isotope)) else: - parts.append(f'{_str_one_atom(part)}{subscript(count)}') - return ''.join(parts) + isotope = "" + return f"{isotope}{atom.symbol}{valence}" + + def format(self, compound: Formula): + if self.mode == 'plain': + return str(compound) + return self.walk(compound.structure) + + def walk(self, structure): + parts = [] + for count, part in structure: + if isinstance(part, tuple): + if count == 1: + parts.append(self.walk(part)) + else: + parts.append(f'({self.walk(part)}){self.subscript(count)}') + elif count 
== 1: + parts.append(self.walk_atom(part)) + else: + parts.append(f'{self.walk_atom(part)}{self.subscript(count)}') + return ''.join(parts) + +def pretty(compound: Formula, mode: str='unicode') -> str: + """ + Convert the formula to a string. + + *mode* is unicode, html, latex, plain [default = unicode] + + If *mode* is 'plain' then don't use superscipts and subscripts for rendering. + + Use *pretty(compound.hill)* for a more compact representation. + """ + if mode is None: + mode = 'unicode' + return PrettyFormula(mode).format(compound) def demo(): import sys From a30069e5b214746fa63398a2789e441c2251f551 Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Fri, 6 Mar 2026 15:06:11 -0500 Subject: [PATCH 07/19] fix tests --- explore/lark_parse.py | 1 + periodictable/core.py | 2 +- periodictable/formulas.py | 41 ++++++++++++++++++++++++++------------- test/test_core.py | 2 +- 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/explore/lark_parse.py b/explore/lark_parse.py index 40ae090..24c38e3 100644 --- a/explore/lark_parse.py +++ b/explore/lark_parse.py @@ -838,6 +838,7 @@ def check(): density = getattr(tree, 'density', None) density_str = f" @ {density:.2f}" if density else "" mode = 'unicode' # unicode latex html plain + # mode = 'plain' print(f" => {pretty_formula(tree, mode)}{density_str}") # print(f" {getattr(tree, 'structure', None)}") except Exception as exc: diff --git a/periodictable/core.py b/periodictable/core.py index 9c8e496..fb095b3 100644 --- a/periodictable/core.py +++ b/periodictable/core.py @@ -398,7 +398,7 @@ def __init__(self, element_or_isotope: Element|Isotope): def __getitem__(self, charge: int) -> Ion: if charge not in self.ionset: if charge not in self.element_or_isotope.ions: - valence = f"{abs(charge)}{'+' if charge > 0 else '-'}" + valence = f"{abs(charge)}{'-' if charge < 0 else '+'}" symbol = self.element_or_isotope.symbol raise ValueError(f"valence {valence} is not valid for {symbol}") self.ionset[charge] = 
Ion(self.element_or_isotope, charge) diff --git a/periodictable/formulas.py b/periodictable/formulas.py index 9e697ff..2b118d7 100644 --- a/periodictable/formulas.py +++ b/periodictable/formulas.py @@ -675,7 +675,7 @@ def __rmul__(self, other): return ret def __str__(self): - return self.name if self.name else _str_atoms(self.structure) + return self.name if self.name else "".join(_str_atoms(self.structure)) def __repr__(self): return "formula('%s')"%(str(self)) @@ -1073,24 +1073,41 @@ def _convert_to_hill_notation(atoms: dict[Atom, float]) -> Structure: """ return tuple((atoms[el], el) for el in sorted(atoms.keys(), key=_hill_key)) -def _str_atoms(seq) -> str: +def _str_one_atom(atom: Atom) -> str: + """ + Format a single atom as SYMBOL[ISOTOPE]{VALENCE}. + + Can't use str(atom) => ISOTOPE-SYMBOL{VALENCE} or repr(atom) => SYMBOL[ISOTOPE].ion[VALENCE] + """ + valence = isotope = "" + if ision(atom): + ion = cast(Ion, atom) + charge = '-' if ion.charge < 0 else '+' + magnitude = abs(ion.charge) + valence = charge*magnitude if magnitude < 2 else f"{magnitude}{charge}" + valence = "{%s}"%valence + atom = ion.element + if isisotope(atom): + iso = cast(Isotope, atom) + if iso.symbol == iso.element.symbol: + isotope = f"[{iso.isotope}]" + return f"{atom.symbol}{isotope}{valence}" + +def _str_atoms(seq) -> list[str]: """ Convert formula structure to string. 
""" #print "str", seq - ret = "" + ret = [] for count, fragment in seq: if isatom(fragment): - ret += str(fragment) + ret.append(_str_one_atom(fragment)) if count != 1: - ret += "%g"%count + ret.append(f"{count:g}") + elif count == 1: + ret.extend(_str_atoms(fragment)) else: - if count == 1: - piece = _str_atoms(fragment) - else: - piece = "(%s)%g"%(_str_atoms(fragment), count) - #ret = ret+" "+piece if ret else piece - ret += piece + ret.extend(("(", *_str_atoms(fragment), ")", f"{count:g}")) return ret @@ -1200,8 +1217,6 @@ def __init__(self, mode): self.superscript = SUPERSCRIPT[mode] def walk_atom(self, atom): - if self.mode == 'plain': - return str(atom) if ision(atom): charge = '-' if atom.charge < 0 else '+' magnitude = abs(atom.charge) diff --git a/test/test_core.py b/test/test_core.py index d010a76..c86b9a0 100644 --- a/test/test_core.py +++ b/test/test_core.py @@ -64,7 +64,7 @@ def test(): Fe.ion[-3] raise Exception("accepts invalid ions") except ValueError as msg: - assert str(msg) == "-3 is not a valid charge for Fe" + assert str(msg) == "valence 3- is not valid for Fe" assert data_files()[0][0] == "periodictable-data/xsf" From f8cecea9072a6b060763a4621f962968c16c7b58 Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Wed, 20 May 2026 20:01:38 -0400 Subject: [PATCH 08/19] add output type hints to ast transforms --- explore/lark_parse.py | 135 ++++++++++++++++++++++------------------- periodictable/fasta.py | 8 +-- 2 files changed, 78 insertions(+), 65 deletions(-) diff --git a/explore/lark_parse.py b/explore/lark_parse.py index 24c38e3..ac412b2 100644 --- a/explore/lark_parse.py +++ b/explore/lark_parse.py @@ -1,10 +1,9 @@ import lark -import periodictable as pt -from periodictable.core import PeriodicTable +from periodictable.core import PeriodicTable, Element, Atom, Isotope from periodictable.core import default_table from periodictable.formulas import ( from_subscript, from_superscript, - Formula, + Formula, Structure, _mix_by_weight_pairs, 
_mix_by_volume_pairs, VOLUME_UNITS, MASS_UNITS, LENGTH_UNITS, pretty as pretty_formula @@ -89,13 +88,16 @@ def int_or_float(s): i = int(f) return i if i == f else f -class StripJunk(lark.Transformer): +class StripPunctuation(lark.Transformer): """ Token stripper visitor class. This is done separately from the formula composer so that we can show the cleaned tree before debugging the conversion. + Unnamed punctuation characters []{}():% and units (kg, mL, nm, ...) which are represented + as quoted strings in the grammar have no associated token. + Note: could get the same effect by renaming the unused terminals with leading underscore, but that makes the grammar harder to read. """ @@ -173,20 +175,20 @@ def COUNT(self, token: lark.Token) -> int|float: return it as an integer. """ return int_or_float(from_subscript(token.value)) - def SUPERINT(self, token): + def SUPERINT(self, token) -> int: """ Return the integer value of a sequence of superscript digits. This is used to specify the valence or to specify the isotope. """ return int(from_superscript(token.value)) - def DENSITYMODE(self, token): + def DENSITYMODE(self, token) -> str: """ Return the value of the DENSITYMODE token, either "n" or "i". If no mode is specified then a token value of None will be given to the density rule. """ return token.value - def CHARGE(self, token): + def CHARGE(self, token) -> int: """ Return a sequence of plus and minus characters. By grammar rules they must all have the same sign. @@ -194,14 +196,14 @@ def CHARGE(self, token): This is used in the valence rule to specify the charge for the atom. """ return token.value - def SUPERCHARGE(self, token): + def SUPERCHARGE(self, token) -> int: """ Convert sequence of superscript plus and minus characters to ASCII plus and minus. This is used in the valence rule to specify the charge for the atom. 
""" return from_superscript(token.value) - def SYMBOL(self, token): + def SYMBOL(self, token) -> Element: """ Look up the element in the periodic table and return it. @@ -211,32 +213,24 @@ def SYMBOL(self, token): return self._table.symbol(token.value) except Exception: raise ValueError(f"Element {token.value} doesn't exist") - def FASTA(self, token): + def FASTA(self, token) -> str: """ Return the token value as the fasta sequence type: "dna", "rna" or "aa". """ return token.value - def SEQUENCE(self, token): + def SEQUENCE(self, token) -> str: """ Return the token value as the fasta sequence string. """ return token.value - def fasta(self, tokens): - """ - Return a fasta sequence and its type. - - Transform: [type, sequence] => ('fasta', type, sequence) - """ - stype, sequence = tokens - return 'fasta', stype, sequence - def isotope(self, tokens): + def isotope(self, tokens) -> int: """ Return the isotope number for the atom. Transform: [isotope] => isotope """ return tokens[0] - def valence(self, tokens): + def valence(self, tokens) -> int: """ Return valence from number and sign. @@ -267,7 +261,7 @@ def valence(self, tokens): raise ValueError(f"Use {value}{charge[0]} instead of {value}{charge} for valence") valence = value if charge[0] == '+' else -value return valence - def atom(self, tokens): + def atom(self, tokens) -> Atom: """ Returns an atom from the periodic table. @@ -297,7 +291,7 @@ def atom(self, tokens): #print(f"atom {tokens} => {atom}") return atom - def isoatom(self, tokens): + def isoatom(self, tokens) -> Atom: """ Returns an isotope from the periodic table. @@ -319,37 +313,46 @@ def isoatom(self, tokens): return atom - def group(self, tokens): + def group(self, tokens) -> Structure: """ Returns a sequence of (count, item) pairs, where item is an atom or a nested formula. Missing counts default to 1. Transform: [atom|formula, count|None, ...] => ((count, atom|formula), ...) 
+ + Example CaCO3: [Ca, None, C, None, O, 3] + => ((1, Ca), (1, C), (3, O)) """ + # print("group tokens", tokens) tokens = [1 if value is None else value for value in tokens] pairs = tuple((count, item) for item, count in zip(tokens[::2], tokens[1::2])) + # print("group output", pairs) return pairs - def composite(self, tokens): + def composite(self, tokens) -> Structure: """ Returns a sequence of (number, group) pairs. Each group is a sequence of (count, item) pairs, where item is an atom or a nested formula. Missing numbers default to 1. Transform: [number|None, group, ...] => ((number, group), ...) | ((count, atom), ...) - Example CaCO3 6H2O: None, ((1, Ca), (1, C), (3, O)), 6, ((2, H), (1, O))] + Example CaCO3 6H2O: [None, ((1, Ca), (1, C), (3, O)), 6, ((2, H), (1, O))] => ((1, ((1, Ca), (1, C), (3, O))), (6, ((2, H), (1, O)))) - Example CaCO3(H20)6: [[None, ((1, Ca), (1, C), (3, O), (6, formula('H2O')))] - => ((1, Ca), (1, C), (3, O), (6, formula('H2O'))) + Example CaCO3(H2O)6: [None, ((1, Ca), (1, C), (3, O), (6, formula('H2O')))] + => ((1, ((1, Ca), (1, C), (3, O), (6, formula('H2O')))),) + + Example CaCO3 (H2O)6: [None, ((1, Ca), (1, C), (3, O)), None, ((6, formula('H2O')),)] + => ((1, ((1, Ca), (1, C), (3, O))), (1, ((6, formula('H2O')),))) """ - # print("in composite", tokens) + # print("composite tokens", tokens) numbers = [1 if v is None else v for v in tokens[::2]] groups = tokens[1::2] pairs = tuple((number, group) for number, group in zip(numbers, groups)) + # print("composite output", pairs) return pairs - def fasta(self, tokens): + def fasta(self, tokens) -> Structure: """ Returns the formula corresponding to the FASTA sequence, with the natural density set. Labile hydrogen use H[1] in the formula. 
@@ -359,23 +362,25 @@ def fasta(self, tokens): Transform: [ /aa|dna|rna/, /[A-Z -*]+/ ] => (1, ((1, formula),)) - Example dna:CAGT: ['dna', 'CAGT'] => (1, ((1, C39H37H[1]10N15O25P4@1.69),)) + Example dna:CAGT: ['dna', 'CAGT'] + => ((1, ((1, formula('C39H37H[1]10N15O25P4')),)),) """ # TODO: fasta is ignoring table when parsing # TODO: avoid circular imports # TODO: support other biochemicals (carbohydrate residues, lipids) - from periodictable import fasta + from periodictable.fasta import CODE_TABLES, Sequence - # print("in fasta", tokens) + # print("fasta input", tokens) seq_type, seq = tokens - if seq_type not in fasta.CODE_TABLES: + if seq_type not in CODE_TABLES: raise ValueError(f"Invalid fasta sequence type '{seq_type}:'") - seq = fasta.Sequence(name=None, sequence=seq, type=seq_type) - group = ((1, seq.labile_formula),) - composite = ((1, group),) + seq = Sequence(name=None, sequence=seq, type=seq_type) + pairs = ((1, seq.labile_formula),) + composite = ((1, pairs), ) + # print("fasta output", composite) return composite - def density(self, tokens): + def density(self, tokens) -> tuple[str, float, str]: """ Returns a density tuple from the @density construct. Density mode 'n' for natural or 'i' for isotopic defaults to isotopic. That is, D2O@1.11 is the @@ -394,7 +399,7 @@ def density(self, tokens): mode = 'i' if not tokens[1] else tokens[1] return 'density', value, mode - def compound(self, tokens): + def compound(self, tokens) -> Formula: """ Returns the formula for the compound, with optional density set. 
@@ -411,15 +416,15 @@ def compound(self, tokens): Transform: [((number, group), ...), ('density', value, mode)|None] => formula - Example NaCl@2.16i: [(1, ((1, Na), (1, Cl))), ('density', 2.16, 'i')] => NaCl@2.16i + Example NaCl@2.16i: [((1, ((1, Na), (1, Cl))),), ('density', 2.16, 'i')] => NaCl@2.16i - Example dna:CAGT: [((1, ((1, C39H37H[1]10N15O25P4@1.69n),)),), None] => C39H37H[1]10N15O25P4@1.69n + Example dna:CAGT: [((1, ((1, formula('C39H37H[1]10N15O25P4')),)),), None] => C39H37H[1]10N15O25P4@1.69n Example CaCO3 6H2O: [((1, ((1, Ca), (1, C), (3, O))), (6, ((2, H), (1, O)))), None] => CaCO3(H2O)6 - Example CaCO3(H20)6: [((1, ((1, Ca), (1, C), (3, O), (6, H2O@None))),), None] => CaCO3(H2O)6 + Example CaCO3(H2O)6: [((1, ((1, Ca), (1, C), (3, O), (6, formula('H2O')))),), None] => CaCO3(H2O)6 """ - # print("in compound with", tokens) + # print("compound tokens", tokens) components, density_tuple = tokens if density_tuple is None: density, density_mode = None, 'i' @@ -458,10 +463,10 @@ def expand_formula(group): else: formula.density = density - # print(f"compound = {formula} @ {formula.density}") + # print(f"compound output {formula} @ {formula.density}") return formula - def weightpct(self, tokens): + def weightpct(self, tokens) -> float: """ Returns the percentage. The value has already be converted to a number. @@ -473,7 +478,7 @@ def weightpct(self, tokens): """ return tokens[0] - def volumepct(self, tokens): + def volumepct(self, tokens) -> float: """ Returns the percentage. The value has already be converted to a number. @@ -485,7 +490,7 @@ def volumepct(self, tokens): """ return tokens[0] - def percentage(self, tokens): + def percentage(self, tokens) -> float: """ Returns the percentage. The value has already be converted to a number. @@ -495,7 +500,7 @@ def percentage(self, tokens): """ return tokens[0] - def byweight(self, tokens): + def byweight(self, tokens) -> Formula: """ Returns mixture by wt% of the various components in the system. 
@@ -516,7 +521,7 @@ def byweight(self, tokens): # print(f"byweight => {formula} @ {formula.density}") return formula - def byvolume(self, tokens): + def byvolume(self, tokens) -> Formula: """ Returns mixture by vol% of the various components in the system. Volumes are converted to mass using density. @@ -540,7 +545,7 @@ def byvolume(self, tokens): formula = _mix_by_volume_pairs(pairs) return formula - def byamount(self, tokens): + def byamount(self, tokens) -> Formula: """ Returns mixture by mass of the various components in the system. Volumes are converted to mass using density. @@ -568,7 +573,7 @@ def find_value(quantity, formula): formula.total_mass = total return formula - def layers(self, tokens): + def layers(self, tokens) -> Formula: """ Returns the mixture by volume of the various layers in the system. @@ -590,7 +595,7 @@ def layers(self, tokens): formula.thickness = total return formula - def mixture(self, tokens): + def mixture(self, tokens) -> Formula: """ Returns the formula representing the mixture, either byweight, byvolume, byamount or layers @@ -598,7 +603,7 @@ def mixture(self, tokens): """ return tokens[0] - def formula(self, tokens): + def formula(self, tokens) -> Formula: """ Return the formula representing the compound or mixture. @@ -606,7 +611,7 @@ def formula(self, tokens): """ return tokens[0] - def thickness(self, tokens): + def thickness(self, tokens) -> tuple[str, float, str]: """ Returns (dimension, value, unit) with dimension equal 'length' @@ -617,7 +622,7 @@ def thickness(self, tokens): value, (dim, units) = tokens return dim, value, units - def quantity(self, tokens): + def quantity(self, tokens) -> tuple[str, float, str]: """ Returns (dimension, value, unit) with dimension equal 'mass' or 'volume' @@ -628,7 +633,7 @@ def quantity(self, tokens): value, (dim, units) = tokens return dim, value, units - def start(self, tokens): + def start(self, tokens) -> Formula: """ Return the final formula, with the original text attached. 
@@ -708,7 +713,7 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: Parse a chemical formula, returning a structure with elements from the given periodic table. """ - cleanup = StripJunk() + cleanup = StripPunctuation() convert = ConvertTokens(formula_str, table=table) try: tree = formula_parser.parse(formula_str) @@ -821,6 +826,8 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: """ def check(): + from periodictable.formulas import parse_formula as old_parser + for line in examples.split('\n'): formula = line.split('#')[0] bad = line.startswith('!') @@ -833,8 +840,8 @@ def check(): print(f"*** {line}") try: # Toggle the following to test pyparsing vs lark - tree = parse_formula(formula) - #tree = pt.formula(formula) if "##" not in line else "!!! pyparsing fails" + #tree = parse_formula(formula) + tree = old_parser(formula) if "##" not in line else "!!! pyparsing fails" density = getattr(tree, 'density', None) density_str = f" @ {density:.2f}" if density else "" mode = 'unicode' # unicode latex html plain @@ -847,9 +854,15 @@ def check(): else: raise exc from None else: - if '##' in line: continue # pyparsing should fail but doesn't + if '##' in line: + continue # pyparsing should fail but doesn't if bad: raise RuntimeError(f"Exception not raised for <{formula}>") if __name__ == "__main__": - check() \ No newline at end of file + import sys + if len(sys.argv) > 1: + for arg in sys.argv[1:]: + print(parse_formula(arg)) + else: + check() \ No newline at end of file diff --git a/periodictable/fasta.py b/periodictable/fasta.py index b378477..f95d9ed 100644 --- a/periodictable/fasta.py +++ b/periodictable/fasta.py @@ -75,7 +75,7 @@ from collections.abc import Iterator from typing import IO, cast -from .formulas import formula as parse_formula, Formula, FormulaInput +from .formulas import formula as make_formula, Formula, FormulaInput from .nsf import neutron_sld from .xsf import xray_sld from .core import 
default_table, Atom @@ -177,7 +177,7 @@ def __init__( elements = default_table() # Fill in density or cell_volume. - M = parse_formula(formula, natural_density=density) + M = make_formula(formula, natural_density=density) # CRUFT: use of T rather than H[1] is deprecated since 1.5.3 if elements.T in M.atoms: warnings.warn("Use of tritium for labile hydrogen is deprecated." @@ -274,7 +274,7 @@ def __init__(self, name: str, sequence: str, type: str='aa'): structure.extend(list(p.labile_formula.structure)) # Add H + OH terminators to the sequence structure.extend(((2, elements.H[1]), (1, elements.O))) - formula = parse_formula(structure).hill + formula = make_formula(structure).hill Molecule.__init__( self, name, formula, cell_volume=cell_volume, charge=charge) @@ -356,7 +356,7 @@ def _code_average(bases, code_table) -> tuple[Formula, float, float]: Note: averaging can lead to a fractional charge on the returned molecule. """ n = len(bases) - formula, cell_volume, charge = parse_formula(), 0., 0. + formula, cell_volume, charge = make_formula(), 0., 0. 
for c in bases: base = code_table[c] formula += base.labile_formula From 3412542bc6a244b72b4490d9115a5a91ab0de327 Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Wed, 20 May 2026 20:22:20 -0400 Subject: [PATCH 09/19] adjust imports; improve :SEQ error message --- {explore => periodictable}/lark_parse.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) rename {explore => periodictable}/lark_parse.py (99%) diff --git a/explore/lark_parse.py b/periodictable/lark_parse.py similarity index 99% rename from explore/lark_parse.py rename to periodictable/lark_parse.py index ac412b2..d5479d4 100644 --- a/explore/lark_parse.py +++ b/periodictable/lark_parse.py @@ -1,7 +1,7 @@ import lark -from periodictable.core import PeriodicTable, Element, Atom, Isotope -from periodictable.core import default_table -from periodictable.formulas import ( +from .core import PeriodicTable, Element, Atom, Isotope +from .core import default_table +from .formulas import ( from_subscript, from_superscript, Formula, Structure, _mix_by_weight_pairs, _mix_by_volume_pairs, @@ -663,7 +663,7 @@ def _allowed(allowed): NUMBER="NUMBER", # start of compound or start of mixture #FASTA="[dna|rna|aa]:SEQ", FASTA="aa:SEQ", - COLON=":", + COLON=":SEQ", #COLON="aa:SEQ", SEQUENCE="aa:SEQ", SEPARATOR="+", # generic group separator in composite @@ -840,8 +840,8 @@ def check(): print(f"*** {line}") try: # Toggle the following to test pyparsing vs lark - #tree = parse_formula(formula) - tree = old_parser(formula) if "##" not in line else "!!! pyparsing fails" + tree = parse_formula(formula) + #tree = old_parser(formula) if "##" not in line else "!!! 
pyparsing fails" density = getattr(tree, 'density', None) density_str = f" @ {density:.2f}" if density else "" mode = 'unicode' # unicode latex html plain @@ -859,10 +859,14 @@ def check(): if bad: raise RuntimeError(f"Exception not raised for <{formula}>") -if __name__ == "__main__": +def main(): import sys + if len(sys.argv) > 1: for arg in sys.argv[1:]: print(parse_formula(arg)) else: - check() \ No newline at end of file + check() + +if __name__ == "__main__": + main() From a1579af92edf624517ff999ef78c7a4a188fa0ba Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Wed, 20 May 2026 20:56:38 -0400 Subject: [PATCH 10/19] move unicode superscript/subscript conversion to util; clean up type hints --- periodictable/formulas.py | 84 ++++--------------------------------- periodictable/lark_parse.py | 14 ++++--- periodictable/util.py | 72 +++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 81 deletions(-) diff --git a/periodictable/formulas.py b/periodictable/formulas.py index 2b118d7..0dc2fc1 100644 --- a/periodictable/formulas.py +++ b/periodictable/formulas.py @@ -6,7 +6,7 @@ from copy import copy from math import pi, sqrt -from typing import cast, Union, Any +from typing import cast, Union, Any, Iterable from collections.abc import Sequence, Callable # Requires that the pyparsing module is installed. 
@@ -15,9 +15,9 @@ ZeroOrMore, OneOrMore, Forward, StringEnd, Group) from .core import default_table, isatom, isisotope, ision, change_table -from .core import Atom, Element, Isotope, Ion, PeriodicTable # for typing +from .core import Atom, Isotope, Ion, PeriodicTable # for typing from .constants import avogadro_number, electron_mass -from .util import cell_volume +from .util import cell_volume, unicode_subscript, unicode_superscript FormulaInput = Union[str, "Formula", Atom, dict[Atom, float], Sequence[tuple[float, Any]], None] Fragment = tuple[float, Union[Atom, "Structure"]] @@ -89,7 +89,7 @@ def mix_by_weight(*args, **kw) -> "Formula": result.name = name return result -def _mix_by_weight_pairs(pairs: list[tuple["Formula", float]]) -> "Formula": +def _mix_by_weight_pairs(pairs: Iterable[tuple["Formula", float]]) -> "Formula": from .formulas import Formula # For running as __main__ # Drop pairs with zero quantity @@ -175,7 +175,7 @@ def mix_by_volume(*args, **kw) -> "Formula": result.name = name return result -def _mix_by_volume_pairs(pairs: list[tuple["Formula", float]]) -> "Formula": +def _mix_by_volume_pairs(pairs: Iterable[tuple["Formula", float]]) -> "Formula": from .formulas import Formula # For running as __main__ # Drop pairs with zero quantity @@ -330,6 +330,8 @@ class Formula: structure: Structure density: float|None name: str|None + total_mass: float|None = None + thickness: float|None = None def __init__(self, structure: Structure=tuple(), @@ -738,6 +740,7 @@ def formula_grammar(table: PeriodicTable) -> ParserElement: # This ickiness is because the formula class returned from the circular # import of fasta does not match the local formula class. from .formulas import Formula + from .util import from_subscript, from_superscript # Recursive composite = Forward() @@ -1112,77 +1115,6 @@ def _str_atoms(seq) -> list[str]: return ret -def from_subscript(value: str) -> str: - """ - Convert unicode subscript characters to normal characters. 
This allows us to parse, - for example, H₂O as H2O. - """ - codepoints = { - '\u2080': '0', '\u2081': '1', '\u2082': '2', '\u2083': '3', - '\u2084': '4', '\u2085': '5', '\u2086': '6', '\u2087': '7', - '\u2088': '8', '\u2089': '9', '\u208a': '+', '\u208b': '-', - '\u208c': '=', '\u208d': '(', '\u208e': ')', - - '\u2090': 'a', '\u2091': 'e', '\u2092': 'o', '\u2093': 'x', - '\u2095': 'h', '\u2096': 'k', '\u2097': 'l', - '\u2098': 'm', '\u2099': 'n', '\u209a': 'p', '\u209b': 's', - '\u209c': 't', - } - return ''.join(codepoints.get(char, char) for char in str(value)) - -def from_superscript(value: str) -> str: - """ - Convert unicode superscript characters to normal characters. This allows us to parse, - for example, Ca²⁺ as Ca{2+}. - """ - codepoints = { - '\u2070': '0', '\u00B9': '1', '\u00B2': '2', '\u00B3': '3', - '\u2074': '4', '\u2075': '5', '\u2076': '6', '\u2077': '7', - '\u2078': '8', '\u2079': '9', '\u207a': '+', '\u207b': '-', - '\u207c': '=', '\u207d': '(', '\u207e': ')', - - '\u2071': 'i', '\u207f': 'n', - } - return ''.join(codepoints.get(char, char) for char in str(value)) - -def unicode_subscript(value: str) -> str: - # Unicode subscript codepoints. Note that decimal point looks okay as subscript - codepoints = { - '0': '\u2080', '1': '\u2081', '2': '\u2082', '3': '\u2083', - '4': '\u2084', '5': '\u2085', '6': '\u2086', '7': '\u2087', - '8': '\u2088', '9': '\u2089', '+': '\u208a', '-': '\u208b', - '=': '\u208c', '(': '\u208d', ')': '\u208e', - - 'a': '\u2090', 'e': '\u2091', 'o': '\u2092', 'x': '\u2093', - 'h': '\u2095', 'k': '\u2096', 'l': '\u2097', - 'm': '\u2098', 'n': '\u2099', 'p': '\u209a', 's': '\u209b', - 't': '\u209c', - - '\u2013': '\u208b', # en-dash is same as dash - '\u2014': '\u208b', # em-dash is same as dash - } - return ''.join(codepoints.get(char, char) for char in str(value)) - -def unicode_superscript(value: str) -> str: - # Unicode subscript codepoints. 
Note that decimal point looks okay as subscript - codepoints = { - #'.': '\u00B0', # degree symbol looks too much like zero - #'.': ' \u02D9', # dot above modifier looks okay in a floating string, but risky - #'.': ' \u0307', # space with dot above? - #'.': '\u22C5', # math dot operator - '.': '\u1427', # Canadian aboriginal extended block dot (looks good on mac) - '2': '\u00B2', '3': '\u00B3', - '1': '\u00B9', - '0': '\u2070', 'i': '\u2071', - '4': '\u2074', '5': '\u2075', '6': '\u2076', '7': '\u2077', - '9': '\u2078', '0': '\u2079', '+': '\u207a', '-': '\u207b', - '=': '\u207c', '(': '\u207d', ')': '\u207e', 'n': '\u207f', - - '\u2013': '\u207b', # en-dash is same as dash - '\u2014': '\u207b', # em-dash is same as dash - } - return ''.join(codepoints.get(char, char) for char in str(value)) - SUBSCRIPT: dict[str, Callable[[str], str]] = { # The latex renderer should work for github style markdown 'latex': lambda text: f'$_{{{text}}}$', diff --git a/periodictable/lark_parse.py b/periodictable/lark_parse.py index d5479d4..581bb5c 100644 --- a/periodictable/lark_parse.py +++ b/periodictable/lark_parse.py @@ -1,13 +1,16 @@ +from typing import cast + import lark + from .core import PeriodicTable, Element, Atom, Isotope from .core import default_table from .formulas import ( - from_subscript, from_superscript, Formula, Structure, _mix_by_weight_pairs, _mix_by_volume_pairs, VOLUME_UNITS, MASS_UNITS, LENGTH_UNITS, pretty as pretty_formula ) +from .util import from_subscript, from_superscript # TODO: valence belongs to a group rather than element @@ -196,7 +199,7 @@ def CHARGE(self, token) -> int: This is used in the valence rule to specify the charge for the atom. """ return token.value - def SUPERCHARGE(self, token) -> int: + def SUPERCHARGE(self, token) -> str: """ Convert sequence of superscript plus and minus characters to ASCII plus and minus. 
@@ -374,11 +377,12 @@ def fasta(self, tokens) -> Structure: seq_type, seq = tokens if seq_type not in CODE_TABLES: raise ValueError(f"Invalid fasta sequence type '{seq_type}:'") - seq = Sequence(name=None, sequence=seq, type=seq_type) + seq = Sequence(name="seq", sequence=seq, type=seq_type) pairs = ((1, seq.labile_formula),) - composite = ((1, pairs), ) + composite = ((1, pairs),) # print("fasta output", composite) - return composite + # return tuple[tuple[int, tuple[tuple[int, Formula]]]] as Structure + return cast(Structure, composite) def density(self, tokens) -> tuple[str, float, str]: """ diff --git a/periodictable/util.py b/periodictable/util.py index d7fa8ec..0b7267c 100644 --- a/periodictable/util.py +++ b/periodictable/util.py @@ -53,6 +53,78 @@ def parse_uncertainty(s: str) -> tuple[float, float]|tuple[None, None]: # Plain value with no uncertainty return float(s), 0 +def from_subscript(value: str) -> str: + """ + Convert unicode subscript characters to normal characters. This allows us to parse, + for example, H₂O as H2O. + """ + codepoints = { + '\u2080': '0', '\u2081': '1', '\u2082': '2', '\u2083': '3', + '\u2084': '4', '\u2085': '5', '\u2086': '6', '\u2087': '7', + '\u2088': '8', '\u2089': '9', '\u208a': '+', '\u208b': '-', + '\u208c': '=', '\u208d': '(', '\u208e': ')', + + '\u2090': 'a', '\u2091': 'e', '\u2092': 'o', '\u2093': 'x', + '\u2095': 'h', '\u2096': 'k', '\u2097': 'l', + '\u2098': 'm', '\u2099': 'n', '\u209a': 'p', '\u209b': 's', + '\u209c': 't', + } + return ''.join(codepoints.get(char, char) for char in str(value)) + +def from_superscript(value: str) -> str: + """ + Convert unicode superscript characters to normal characters. This allows us to parse, + for example, Ca²⁺ as Ca{2+}. 
+ """ + codepoints = { + '\u2070': '0', '\u00B9': '1', '\u00B2': '2', '\u00B3': '3', + '\u2074': '4', '\u2075': '5', '\u2076': '6', '\u2077': '7', + '\u2078': '8', '\u2079': '9', '\u207a': '+', '\u207b': '-', + '\u207c': '=', '\u207d': '(', '\u207e': ')', + + '\u2071': 'i', '\u207f': 'n', + } + return ''.join(codepoints.get(char, char) for char in str(value)) + +def unicode_subscript(value: str) -> str: + # Unicode subscript codepoints. Note that decimal point looks okay as subscript + codepoints = { + '0': '\u2080', '1': '\u2081', '2': '\u2082', '3': '\u2083', + '4': '\u2084', '5': '\u2085', '6': '\u2086', '7': '\u2087', + '8': '\u2088', '9': '\u2089', '+': '\u208a', '-': '\u208b', + '=': '\u208c', '(': '\u208d', ')': '\u208e', + + 'a': '\u2090', 'e': '\u2091', 'o': '\u2092', 'x': '\u2093', + 'h': '\u2095', 'k': '\u2096', 'l': '\u2097', + 'm': '\u2098', 'n': '\u2099', 'p': '\u209a', 's': '\u209b', + 't': '\u209c', + + '\u2013': '\u208b', # en-dash is same as dash + '\u2014': '\u208b', # em-dash is same as dash + } + return ''.join(codepoints.get(char, char) for char in str(value)) + +def unicode_superscript(value: str) -> str: + # Unicode subscript codepoints. Note that decimal point looks okay as subscript + codepoints = { + #'.': '\u00B0', # degree symbol looks too much like zero + #'.': ' \u02D9', # dot above modifier looks okay in a floating string, but risky + #'.': ' \u0307', # space with dot above? 
+ #'.': '\u22C5', # math dot operator + '.': '\u1427', # Canadian aboriginal extended block dot (looks good on mac) + '2': '\u00B2', '3': '\u00B3', + '1': '\u00B9', + '0': '\u2070', 'i': '\u2071', + '4': '\u2074', '5': '\u2075', '6': '\u2076', '7': '\u2077', + '9': '\u2078', '0': '\u2079', '+': '\u207a', '-': '\u207b', + '=': '\u207c', '(': '\u207d', ')': '\u207e', 'n': '\u207f', + + '\u2013': '\u207b', # en-dash is same as dash + '\u2014': '\u207b', # em-dash is same as dash + } + return ''.join(codepoints.get(char, char) for char in str(value)) + + def cell_volume(a=None, b=None, c=None, alpha=None, beta=None, gamma=None) -> float: r""" Compute cell volume from lattice parameters. From 0efb66629514b0b3f6f0c9b2eac21fed93e397bd Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Thu, 21 May 2026 17:14:23 -0400 Subject: [PATCH 11/19] fix tests and doc build --- doc/sphinx/conf.py | 7 ++- doc/sphinx/genmods.py | 1 + doc/sphinx/guide/formula_grammar.rst | 66 ++++++++++++++++--------- periodictable/formulas.py | 58 ++++++++++++++-------- periodictable/lark_parse.py | 73 +++++++++++++++------------- pyproject.toml | 5 +- test/test_formulas.py | 24 ++++++--- 7 files changed, 148 insertions(+), 86 deletions(-) diff --git a/doc/sphinx/conf.py b/doc/sphinx/conf.py index dad6c9e..d65594d 100644 --- a/doc/sphinx/conf.py +++ b/doc/sphinx/conf.py @@ -27,6 +27,7 @@ sys.path.insert(0, os.path.abspath('../..')) sys.path.insert(0, os.path.abspath('_extensions')) import periodictable +from periodictable.lark_parse import grammar # -- General configuration ----------------------------------------------------- @@ -65,6 +66,11 @@ ('py:class', 'numpy._typing._array_like._ScalarT'), ('py:class', 'numpy._typing._nested_sequence._NestedSequence'), ('py:class', 'pyparsing.core.ParserElement'), + ('py:class', 'lark.tree.Tree'), + ('py:class', 'lark.lexer.Token'), + ('py:class', 'lark.visitors.Transformer'), + ('py:class', 'lark.visitors._Leaf_T'), + ('py:class', 'lark.visitors._Return_T'), 
('py:class', 'periodictable.core._AtomBase'), ('py:class', 'periodictable.core.IonSet'), @@ -300,4 +306,3 @@ if os.path.exists('rst_prolog'): with io.open('rst_prolog', encoding='utf-8') as fid: rst_prolog = fid.read() - diff --git a/doc/sphinx/genmods.py b/doc/sphinx/genmods.py index 9cdc46a..4b930e5 100644 --- a/doc/sphinx/genmods.py +++ b/doc/sphinx/genmods.py @@ -55,6 +55,7 @@ def genfiles(package, package_name, modules, dir='api'): #('__init__', 'Top level namespace'), ('core', 'Core table'), ('formulas', 'Chemical formula operations'), + ('lark_parse', 'Chemical formula parser'), ('covalent_radius', 'Covalent radius'), ('constants', 'Fundamental constants'), ('crystal_structure', 'Crystal structure'), diff --git a/doc/sphinx/guide/formula_grammar.rst b/doc/sphinx/guide/formula_grammar.rst index db1ab2c..da694cd 100644 --- a/doc/sphinx/guide/formula_grammar.rst +++ b/doc/sphinx/guide/formula_grammar.rst @@ -159,28 +159,50 @@ The grammar used for parsing formula strings is the following: :: - formula :: compound | mixture | nothing - mixture :: quantity | percentage - quantity :: number unit part ('//' number unit part)* - percentage :: number 'wt%|vol%' part ('//' number '%' part)* '//' part - part :: compound | '(' mixture ')' - compound :: (composite | fasta) density? - fasta :: ('dna' | 'rna' | 'aa') ':' [A-Z -*]+ - composite :: group (separator group)* - group :: number element+ | '(' formula ')' number - element :: symbol isotope? ion? number? - symbol :: [A-Z][a-z]* - isotope :: '[' integer ']' - ion :: '{' integer? [+-] '}' - density :: '@' number [ni]? - number :: integer | fraction - integer :: [1-9][0-9]* - fraction :: ([1-9][0-9]* | 0)? '.' [0-9]* - separator :: space? '+'? space? 
- unit :: mass | volume | length - mass :: 'kg' | 'g' | 'mg' | 'ug' | 'ng' - volume :: 'L' | 'mL' | 'uL' | 'nL' - length :: 'cm' | 'mm' | 'um' | 'nm' + formula : compound | mixture + + # Mixture definitions: quantity compound // quantity compound // quantity compound + mixture : byamount | byvolume | byweight | layers + byamount : quantity compound (MIX quantity compound)* + byvolume : volumepct compound (MIX percentage compound)* MIX compound + byweight : weightpct compound (MIX percentage compound)* MIX compound + layers : thickness compound (MIX thickness compound)* + quantity : NUMBER SPACE? (MASS | VOLUME) SPACE + weightpct : NUMBER SPACE? WEIGHTPCT SPACE + volumepct : NUMBER SPACE? VOLUMEPCT SPACE + thickness : NUMBER SPACE? LENGTH SPACE + percentage : NUMBER SPACE? "%" SPACE # Allows "3 % " + + # Compound definition: number group ... @density where group is El count El count ... + # FASTA sequences: (rna|dna|aa) : SEQUENCE @ density + # Density applies to the entire formula, such as "NaCl + 29.2H2O @ 1.07n" + # For the density of a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n" + compound : (composite | fasta) [density] + fasta : FASTA ":" SEQUENCE + composite : [NUMBER] group (SEPARATOR [NUMBER] group)* + group : ((atom | isoatom | "(" formula ")") [COUNT])+ + atom : SYMBOL [isotope] [valence] + isoatom : SUPERINT SYMBOL [valence] # For example ²H for deuterium + isotope : "[" INTEGER "]" + valence : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE + density : SPACE? "@" SPACE? DENSITY [DENSITYMODE] + + # Tokens + #FASTA : /dna|rna|aa/ # Sequence type is limited to these values but ... 
+ FASTA : /[a-z]+/ # "type:sequence" syntax allows better error reporting + SEQUENCE : /[-A-Z *]+/ + # could list all elements, but better error reporting if element symbol lookup fails + SYMBOL : /[A-Z][a-z]*/ + CHARGE : /[+]+|[-]+/ # allow valence using {++} or {--} + DENSITY : NUMBER # using alias DENSITY for number for better error reporting + DENSITYMODE: /[ni]/ # n=natural density, i=isotopic density + MIX : SPACE? "//" SPACE? + WEIGHTPCT : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/ + VOLUMEPCT : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/ + MASS : "kg" | "g" | "mg" | "ug" | "μg" | "ng" + VOLUME : "L" | "mL" | "uL" | "μL" | "nL" + LENGTH : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å" + COUNT : NUMBER | SUBNUM # atom counts can be normal numbers or unicode subscripts Formulas can also be constructed from atoms or other formulas: diff --git a/periodictable/formulas.py b/periodictable/formulas.py index 0dc2fc1..e752e3e 100644 --- a/periodictable/formulas.py +++ b/periodictable/formulas.py @@ -6,14 +6,9 @@ from copy import copy from math import pi, sqrt -from typing import cast, Union, Any, Iterable +from typing import cast, Union, Any, Iterable, TYPE_CHECKING from collections.abc import Sequence, Callable -# Requires that the pyparsing module is installed. - -from pyparsing import (ParserElement, Literal, Optional, White, Regex, - ZeroOrMore, OneOrMore, Forward, StringEnd, Group) - from .core import default_table, isatom, isisotope, ision, change_table from .core import Atom, Isotope, Ion, PeriodicTable # for typing from .constants import avogadro_number, electron_mass @@ -227,7 +222,7 @@ def formula( change in cell volume. *name* : string - Common name for the molecule. + Common name for the material. *table* : PeriodicTable Private table to use when parsing string formulas. @@ -288,6 +283,7 @@ def formula( display purposes. 
""" from .formulas import Formula # For running as __main__ + from .lark_parse import parse_formula structure: Structure if compound is None or compound == '': @@ -328,10 +324,25 @@ class Formula: Simple chemical formula representation. """ structure: Structure + """Nested structure ((count, atom|structure), ...)""" density: float|None + """ + |g/cm^3| + + Density of the material. + """ name: str|None + """ + Name of the material. Default is the input string for the formula parser. + """ total_mass: float|None = None + """ + For mixture by mass, the total mass of the mixture (g). + """ thickness: float|None = None + """ + For mixture by layer, the total thickness of the mixture (cm). + """ def __init__(self, structure: Structure=tuple(), @@ -413,7 +424,7 @@ def natural_density(self) -> float | None: """ |g/cm^3| - Density of the formula with specific isotopes of each element + Density of the material with specific isotopes of each element replaced by the naturally occurring abundance of the element without changing the cell volume. 
""" @@ -677,7 +688,8 @@ def __rmul__(self, other): return ret def __str__(self): - return self.name if self.name else "".join(_str_atoms(self.structure)) + # return self.name if self.name else "".join(_str_atoms(self.structure)) + return "".join(_str_atoms(self.structure)) def __repr__(self): return "formula('%s')"%(str(self)) @@ -711,15 +723,12 @@ def _isotope_substitution(compound: "Formula", source: Atom, target: Atom, porti density = compound.density return formula(atoms, density=density) +if TYPE_CHECKING: + from pyparsing import ParserElement # TODO: Grammar should be independent of table -# TODO: Parser can't handle meters as 'm' because it conflicts with the milli prefix -LENGTH_UNITS = {'nm': 1e-9, 'um': 1e-6, 'μm': 1e-6, 'mm': 1e-3, 'cm': 1e-2, 'Ang': 1e-10, 'Å': 1e-10} -MASS_UNITS = {'ng': 1e-9, 'ug': 1e-6, 'mg': 1e-3, 'g': 1e+0, 'kg': 1e+3} -VOLUME_UNITS = {'nL': 1e-9, 'uL': 1e-6, 'mL': 1e-3, 'L': 1e+0} -LENGTH_RE = '('+'|'.join(LENGTH_UNITS.keys())+')' -MASS_VOLUME_RE = '('+'|'.join(list(MASS_UNITS.keys())+list(VOLUME_UNITS.keys()))+')' -def formula_grammar(table: PeriodicTable) -> ParserElement: + +def formula_grammar(table: PeriodicTable) -> "ParserElement": """ Construct a parser for molecular formulas. @@ -736,11 +745,22 @@ def formula_grammar(table: PeriodicTable) -> ParserElement: an *element* or a list of pairs (*count, fragment*). """ + # Requires that the pyparsing module is installed. + + from pyparsing import ( + Literal, Optional, White, Regex, ZeroOrMore, OneOrMore, Forward, StringEnd, Group, + ) + # TODO: fix circular imports # This ickiness is because the formula class returned from the circular # import of fasta does not match the local formula class. 
from .formulas import Formula - from .util import from_subscript, from_superscript + from .util import from_subscript + from .lark_parse import LENGTH_UNITS, MASS_UNITS, VOLUME_UNITS + + LENGTH_RE = '('+'|'.join(LENGTH_UNITS.keys())+')' + MASS_VOLUME_RE = '('+'|'.join(list(MASS_UNITS.keys())+list(VOLUME_UNITS.keys()))+')' + # Recursive composite = Forward() @@ -989,8 +1009,8 @@ def convert_mixture(string, location, tokens): grammar.set_name('Chemical Formula') return grammar -_PARSER_CACHE: dict[PeriodicTable, ParserElement] = {} -def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: +_PARSER_CACHE: dict[PeriodicTable, "ParserElement"] = {} +def old_parser(formula_str: str, table: PeriodicTable|None=None) -> Formula: """ Parse a chemical formula, returning a structure with elements from the given periodic table. diff --git a/periodictable/lark_parse.py b/periodictable/lark_parse.py index 581bb5c..6b2ab83 100644 --- a/periodictable/lark_parse.py +++ b/periodictable/lark_parse.py @@ -7,25 +7,29 @@ from .formulas import ( Formula, Structure, _mix_by_weight_pairs, _mix_by_volume_pairs, - VOLUME_UNITS, MASS_UNITS, LENGTH_UNITS, pretty as pretty_formula ) from .util import from_subscript, from_superscript # TODO: valence belongs to a group rather than element +# TODO: Parser can't handle meters as 'm' because it conflicts with the milli prefix +LENGTH_UNITS = {'nm': 1e-9, 'um': 1e-6, 'μm': 1e-6, 'mm': 1e-3, 'cm': 1e-2, 'Ang': 1e-10, 'Å': 1e-10} +MASS_UNITS = {'ng': 1e-9, 'ug': 1e-6, 'mg': 1e-3, 'g': 1e+0, 'kg': 1e+3} +VOLUME_UNITS = {'nL': 1e-9, 'uL': 1e-6, 'mL': 1e-3, 'L': 1e+0} + +# TODO: use grammar string directly in the sphinx/guide/formula_grammar.rst grammar = """ -start : SPACE? formula SPACE? # strip blank space from start and end +start : SPACE? formula SPACE? 
# strip blank space from start and end formula : compound | mixture # Mixture definitions: quantity compound // quantity compound // quantity compound # Activation only cares about total mass, so you can freely mix masses and volumes if -# you have the density for each component. Scattering cares about density of the mixture, -# which in general is different from the mixture of densities. -# To convert layers to masses for activation estimates we need density. Also need to scale by -# area to convert density and thickness to mass. Assume unit area is cm^2, so for -# example "4 (5 nm Ni // 2 mm Si)" is a 4 cm^2 wafer of nickel on silicon. If you -# were to add a polymer you would need its density: "4 (20 nm C5H10@1.2 +# you have the density for each component. For scattering you need the density of the +# mixture. When this is different from the mixture of densities use (mixture)@density. +# For thin film samples, allow stacking of layers with the thickness of each layer. +# With density for each layer the relative quantities of each element in the stack can +# be calculated. Convert to mass by multiplying by thickness (cm) and area (cm²). mixture : byamount | byvolume | byweight | layers byamount : quantity compound (MIX quantity compound)* @@ -38,49 +42,48 @@ thickness : NUMBER SPACE? LENGTH SPACE percentage : NUMBER SPACE? "%" SPACE # Allows "3 % " -# Compound definition: number group ... @ density where group is El count El count ... +# Composite: number group ... @density where group is El count El count ... 
+# Density applies to the entire composite, such as "NaCl + 29.2H2O @ 1.07n" +# For the density of a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n" # FASTA sequences: (rna|dna|aa) : SEQUENCE @ density -# Density applies to the entire formula, such as "NaCl + 29.2H2O @ 1.07n" -# If you do this as a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n" -# Note: `[token]` leaves a None placeholder in the tree, unlike `token?` +# Note: optional `[token]` leaves a None placeholder in the tree, unlike `token?` compound : (composite | fasta) [density] fasta : FASTA ":" SEQUENCE -FASTA : /[a-z]+/ # Generic "str:sequence" syntax allows better error reporting -#FASTA : /dna|rna|aa/ -SEQUENCE : /[A-Z -*]+/ composite : [NUMBER] group (SEPARATOR [NUMBER] group)* group : ((atom | isoatom | "(" formula ")") [COUNT])+ atom : SYMBOL [isotope] [valence] -isoatom : SUPERINT SYMBOL [valence] -# could list all elements, but better error reporting if element symbol lookup fails -SYMBOL : /[A-Z][a-z]*/ +isoatom : SUPERINT SYMBOL [valence] # For example ²H for deuterium isotope : "[" INTEGER "]" valence : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE density : SPACE? "@" SPACE? DENSITY [DENSITYMODE] -DENSITY : NUMBER # using alias DENSITY for number for better error reporting # Tokens +#FASTA : /dna|rna|aa/ # Sequence type is limited to these values but ... +FASTA : /[a-z]+/ # "str:sequence" syntax allows better error reporting +SEQUENCE : /[-A-Z *]+/ +# could list all elements, but better error reporting if element symbol lookup fails +SYMBOL : /[A-Z][a-z]*/ CHARGE : /[+]+|[-]+/ # allow valence using {++} or {--} -SUPERINT : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/ -SUPERCHARGE: /\u207A+|\u207B+/ # Allow Ca++ and Cl- using superscript + and - -DENSITYMODE: /[ni]/ +DENSITY : NUMBER # using alias DENSITY for number for better error reporting +DENSITYMODE: /[ni]/ # n=natural density, i=isotopic density MIX : SPACE? "//" SPACE? 
-# maybe drop "wt%" and "vol%" WEIGHTPCT : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/ VOLUMEPCT : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/ MASS : "kg" | "g" | "mg" | "ug" | "μg" | "ng" VOLUME : "L" | "mL" | "uL" | "μL" | "nL" LENGTH : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å" +COUNT : NUMBER | SUBNUM # atom counts can be normal numbers or unicode subscripts SEPARATOR : SPACE? /[+•·]/ SPACE? | SPACE SPACE : /[ \\t\\n\\r]+/ -COUNT : NUMBER | SUBNUM # atom counts can be normal numbers or unicode subscripts NUMBER : INTEGER | FRACTION INTEGER : /[1-9][0-9]*/ FRACTION : /([1-9][0-9]*|0)?[.][0-9]*/ # allow all floats? SUBNUM : SUBINT | SUBFRAC SUBINT : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*)/ SUBFRAC : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*|)([.][\u2080-\u2089]*)/ +SUPERINT : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/ +SUPERCHARGE: /\u207A+|\u207B+/ # Allow Ca++ and Cl- using superscript + and - """ # propagate_positions saves start_pos and end_pos for each rule as well as each terminal. @@ -356,17 +359,16 @@ def composite(self, tokens) -> Structure: return pairs def fasta(self, tokens) -> Structure: - """ + r""" Returns the formula corresponding to the FASTA sequence, with the natural density set. Labile hydrogen use H[1] in the formula. The extra level of nesting in the return value is so that the fasta structure is like a composite with a single group containing a nested formula. 
- Transform: [ /aa|dna|rna/, /[A-Z -*]+/ ] => (1, ((1, formula),)) + Transform: [ 'aa|dna|rna', '[-A-Z \*]+' ] => (1, ((1, formula),)) - Example dna:CAGT: ['dna', 'CAGT'] - => ((1, ((1, formula('C39H37H[1]10N15O25P4')),)),) + Example: dna:CAGT: ['dna', 'CAGT'] x=> ((1, ((1, formula('C39H37H[1]10N15O25P4')),)),) """ # TODO: fasta is ignoring table when parsing # TODO: avoid circular imports @@ -596,7 +598,7 @@ def layers(self, tokens) -> Formula: total = sum(values) percent = [(m/total)*100 for m in values] formula = _mix_by_volume_pairs(zip(tokens[1::2], percent)) - formula.thickness = total + formula.thickness = 100*total # convert meters to centimeters for cgs units return formula def mixture(self, tokens) -> Formula: @@ -641,14 +643,13 @@ def start(self, tokens) -> Formula: """ Return the final formula, with the original text attached. - Sets formula.source to 'parse string' before returning. + Sets formula.name to the parser input string before returning. Transform: [formula] => formula """ formula = tokens[0] - # TODO: add the source string to the formula class attributes # Remember the string which was parsed - formula.source = self._context + formula.name = self._context return formula # TODO: if the next character is ":" then report error as bad fasta sequence type @@ -830,7 +831,7 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: """ def check(): - from periodictable.formulas import parse_formula as old_parser + from periodictable.formulas import old_parser for line in examples.split('\n'): formula = line.split('#')[0] @@ -868,7 +869,11 @@ def main(): if len(sys.argv) > 1: for arg in sys.argv[1:]: - print(parse_formula(arg)) + formula = parse_formula(arg) + mass = f" {formula.total_mass:.4g} g" if formula.total_mass else "" + density = f"@{formula.density:.4g}" if formula.density else "" + thickness = f" {10*formula.thickness:.4g} mm" if formula.thickness else "" + print(f"{formula}{density}{mass}{thickness}") else: check() 
diff --git a/pyproject.toml b/pyproject.toml index 1b651ef..c4af2ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,8 @@ ] license = { file = "LICENSE.txt" } dependencies = [ - "pyparsing>=3.0.0", "numpy", + "numpy", + "lark", ] classifiers = [ @@ -29,7 +30,7 @@ # Matplotlib and uncertainties are optional packages, used for making # plots in the docs and generating neutron data tables for the web. # mypy checks all code so they are needed for testing as well. - optional = ["uncertainties", "matplotlib"] + optional = ["uncertainties", "matplotlib", "pytparsing>=3.0.0"] docs = ["sphinx", {include-group = "optional"}] test = ["pytest", "pytest-cov", "pytest-mypy", {include-group = "optional"}] dev = [ diff --git a/test/test_formulas.py b/test/test_formulas.py index eda94de..4eb2b9e 100644 --- a/test/test_formulas.py +++ b/test/test_formulas.py @@ -14,12 +14,19 @@ def check_parse_fails(s): raise Exception(f'formula("{s}") should fail to parse') def test(): + # CaCO3(H2O)6 is a tuple of (count, atom) followed by (6, H2O) + # CaCO3+6H2O is ((1, CaCO3), (6, H2O)) ikaite = formula() - # Note: this should be a tuple of tuples ikaite.structure = ((1, Ca), (1, C), (3, O), (6, ((2, H), (1, O)))) + ikaite.name = "CaCO3(H2O)6" + ikaite_grouped = formula() + ikaite_grouped.structure = ((1, ((1, Ca), (1, C), (3, O))), (6, ((2, H), (1, O)))) + ikaite_grouped.name = "CaCO3+6H2O" # Test print assert str(ikaite) == "CaCO3(H2O)6" + assert str(ikaite_grouped) == "CaCO3(H2O)6" + # TODO: parsing a printed structure should produce the same structure # Test constructors assert ikaite == formula([(1, Ca), (1, C), (3, O), (6, [(2, H), (1, O)])]) @@ -31,9 +38,9 @@ def test(): assert formula("Ca") == formula([(1, Ca)]) assert formula("Ca") == formula(Ca) assert formula("CaCO3") == formula([(1, Ca), (1, C), (3, O)]) - assert ikaite == formula("CaCO3+6H2O") - assert ikaite == formula("(CaCO3+6H2O)1") - assert ikaite == formula("CaCO3 6H2O") + assert ikaite_grouped == 
formula("CaCO3+6H2O") + assert ikaite_grouped == formula("(CaCO3+6H2O)1") + assert ikaite_grouped == formula("CaCO3 6H2O") assert ikaite == formula("CaCO3(H2O)6") assert ikaite == formula("(CaCO3(H2O)6)1") assert ikaite.hill == formula("CCaO3(H2O)6").hill @@ -43,7 +50,7 @@ def test(): # Unicode, latex and html subscripts assert formula([(0.75, Fe), (0.25, Ni)]) == formula("Fe₀.₇₅Ni₀.₂₅") assert ikaite == formula("CaCO₃(H₂O)₆") - assert ikaite == formula("CaCO₃6H₂O") # with subscripts we know it isn't O36 + assert ikaite_grouped == formula("CaCO₃ 6H₂O") # with subscripts we know it isn't O36 assert pretty(ikaite, 'unicode') == "CaCO₃(H₂O)₆" assert pretty(ikaite, 'html') == "CaCO3(H2O)6" assert pretty(ikaite, 'latex') == "CaCO$_{3}$(H$_{2}$O)$_{6}$" @@ -116,14 +123,15 @@ def test(): # Check that names work permalloy = formula('Ni8Fe2', 8.692, name='permalloy') - assert str(permalloy) == 'permalloy' + assert str(permalloy) == 'Ni8Fe2' + assert permalloy.name == 'permalloy' # Check that get/restore state works assert deepcopy(permalloy).__dict__ == permalloy.__dict__ # Check that copy constructor works - #print permalloy.__dict__ - #print formula(permalloy).__dict__ + # print(permalloy.__dict__) + # print(formula(permalloy).__dict__) assert formula(permalloy).__dict__ == permalloy.__dict__ assert formula('Si', name='Silicon').__dict__ != formula('Si').__dict__ From 42278e633b372dcb840c4add2a6e07e4c0730da0 Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Thu, 21 May 2026 17:22:40 -0400 Subject: [PATCH 12/19] remove pyparsing dependency --- ChangeLog.rst | 7 + periodictable/formulas.py | 299 ------------------------------------ periodictable/lark_parse.py | 4 - pyproject.toml | 2 +- 4 files changed, 8 insertions(+), 304 deletions(-) diff --git a/ChangeLog.rst b/ChangeLog.rst index 8a7efa9..5ea8a37 100644 --- a/ChangeLog.rst +++ b/ChangeLog.rst @@ -23,6 +23,13 @@ Known issues Change history ============== +2026-05-21 R2.2.0 +----------------- + +Modified: + +* Use lark 
for better error reporting from the formula parser + 2026-02-27 R2.1.0 ----------------- diff --git a/periodictable/formulas.py b/periodictable/formulas.py index e752e3e..8c16a1b 100644 --- a/periodictable/formulas.py +++ b/periodictable/formulas.py @@ -723,305 +723,6 @@ def _isotope_substitution(compound: "Formula", source: Atom, target: Atom, porti density = compound.density return formula(atoms, density=density) -if TYPE_CHECKING: - from pyparsing import ParserElement - -# TODO: Grammar should be independent of table - -def formula_grammar(table: PeriodicTable) -> "ParserElement": - """ - Construct a parser for molecular formulas. - - :Parameters: - - *table* = None : PeriodicTable - If table is specified, then elements and their associated fields - will be chosen from that periodic table rather than the default. - - :Returns: - *parser* : pyparsing.ParserElement. - The ``parser.parse_string()`` method returns a list of - pairs (*count, fragment*), where fragment is an *isotope*, - an *element* or a list of pairs (*count, fragment*). - - """ - # Requires that the pyparsing module is installed. - - from pyparsing import ( - Literal, Optional, White, Regex, ZeroOrMore, OneOrMore, Forward, StringEnd, Group, - ) - - # TODO: fix circular imports - # This ickiness is because the formula class returned from the circular - # import of fasta does not match the local formula class. 
- from .formulas import Formula - from .util import from_subscript - from .lark_parse import LENGTH_UNITS, MASS_UNITS, VOLUME_UNITS - - LENGTH_RE = '('+'|'.join(LENGTH_UNITS.keys())+')' - MASS_VOLUME_RE = '('+'|'.join(list(MASS_UNITS.keys())+list(VOLUME_UNITS.keys()))+')' - - - # Recursive - composite = Forward() - mixture = Forward() - - # whitespace and separators - space = Optional(White().suppress()) - separator = space+Literal('+').suppress()+space - - # Lookup the element in the element table - symbol = Regex("[A-Z][a-z]?") - symbol.set_parse_action(lambda s, l, t: table.symbol(t[0])) - - # Translate isotope - openiso = Literal('[').suppress() - closeiso = Literal(']').suppress() - isotope = Optional(~White()+openiso+Regex("[1-9][0-9]*")+closeiso, - default='0') - isotope.set_parse_action(lambda s, l, t: int(t[0]) if t[0] else 0) - - # Translate ion - openion = Literal('{').suppress() - closeion = Literal('}').suppress() - ion = Optional(~White() +openion +Regex("([1-9][0-9]*)?[+-]") +closeion, - default='0+') - ion.set_parse_action(lambda s, l, t: int(t[0][-1]+(t[0][:-1] if len(t[0]) > 1 else '1'))) - - # Translate counts - # TODO: regex should reject a bare '.' if we want to allow dots between formula parts - fract = Regex("(0|[1-9][0-9]*|)([.][0-9]*)") - fract.set_parse_action(lambda s, l, t: float(t[0]) if t[0] else 1) - whole = Regex("(0|[1-9][0-9]*)") - whole.set_parse_action(lambda s, l, t: int(t[0]) if t[0] else 1) - number = Optional(~White()+(fract|whole), default=1) - # TODO use unicode ₀₁₉ in the code below? 
- sub_fract = Regex("(\u2080|[\u2081-\u2089][\u2080-\u2089]*|)([.][\u2080-\u2089]*)") - sub_fract.set_parse_action(lambda s, l, t: float(from_subscript(t[0])) if t[0] else 1) - sub_whole = Regex("(\u2080|[\u2081-\u2089][\u2080-\u2089]*)") - sub_whole.set_parse_action(lambda s, l, t: int(from_subscript(t[0])) if t[0] else 1) - sub_count = Optional(~White()+(fract|whole|sub_fract|sub_whole), default=1) - - # Fasta code - fasta = Regex("aa|rna|dna") + Literal(":").suppress() + Regex("[A-Z *-]+") - def convert_fasta(string, location, tokens): - #print("fasta", string, location, tokens) - # TODO: fasta is ignoring table when parsing - # TODO: avoid circular imports - # TODO: support other biochemicals (carbohydrate residues, lipids) - from . import fasta - seq_type, seq = tokens - if seq_type not in fasta.CODE_TABLES: - raise ValueError(f"Invalid fasta sequence type '{seq_type}:'") - seq = fasta.Sequence(name=None, sequence=seq, type=seq_type) - return seq.labile_formula - fasta.set_parse_action(convert_fasta) - - # Convert symbol, isotope, ion, count to (count, isotope) - element = symbol+isotope+ion+sub_count - def convert_element(string, location, tokens): - """interpret string as element""" - #print "convert_element received", tokens - symbol, isotope, ion, count = tokens[0:4] - if isotope != 0: - symbol = symbol[isotope] - if ion != 0: - symbol = symbol.ion[ion] - return (count, symbol) - element.set_parse_action(convert_element) - - # Convert "count elements" to a pair - implicit_group = number+OneOrMore(element) - def convert_implicit(string, location, tokens): - """convert count followed by fragment""" - #print "implicit", tokens - count = tokens[0] - fragment = tokens[1:] - return fragment if count == 1 else (count, fragment) - implicit_group.set_parse_action(convert_implicit) - - # Convert "(composite) count" to a pair - opengrp = space + Literal('(').suppress() + space - closegrp = space + Literal(')').suppress() + space - explicit_group = opengrp + composite 
+ closegrp + sub_count - def convert_explicit(string, location, tokens): - """convert (fragment)count""" - #print "explicit", tokens - count = tokens[-1] - fragment = tokens[:-1] - return fragment if count == 1 else (count, fragment) - explicit_group.set_parse_action(convert_explicit) - - # Build composite from a set of groups - group = implicit_group | explicit_group - implicit_separator = separator | space - composite << group + ZeroOrMore(implicit_separator + group) - - density = Literal('@').suppress() + number + Optional(Regex("[ni]"), default='i') - compound = (composite|fasta) + Optional(density, default=None) - def convert_compound(string, location, tokens): - """convert material @ density or fasta @ density""" - # Messiness: both composite and density can be one or more tokens - # If density is missing then it is None, otherwise it is count + [ni] - # Compound can be a sequence of (count, fragment) pairs, or if it is - # a fasta sequence it may already be a formula. - material = tokens[:-1] if tokens[-1] is None else tokens[:-2] - #print("compound", material, type(material[0]), len(material)) - if len(material) == 1 and isinstance(material[0], Formula): - formula = material[0] - else: - #print("unbundling material", material) - formula = Formula(structure=_immutable(material)) - density, form = (None, None) if tokens[-1] is None else tokens[-2:] - #if density is None and formula.density is None: - # # Estimate density from covalent radii and a 0.54 packing factor - # mass = formula.molecular_mass - # volume = formula.volume(packing_factor=0.54, H_radius=1.15) - # density, form = mass/volume, 'n' - # print(f"estimating density as {mass/volume=:.3f}") - if form == 'n': - formula.natural_density = density - elif form == 'i': - formula.density = density - #print("compound", formula, f"{formula.density=:.3f}") - return formula - compound.set_parse_action(convert_compound) - - partsep = space + Literal('//').suppress() + space - percent = Literal('%').suppress() 
- weight = Regex("(w((eigh)?t)?|m(ass)?)").suppress() - volume = Regex("v(ol(ume)?)?").suppress() - weight_percent = (percent + weight) | (weight + percent) + space - volume_percent = (percent + volume) | (volume + percent) + space - mixture_by_weight = (number + weight_percent + mixture - + ZeroOrMore(partsep+number+(weight_percent|percent)+mixture) - + Optional(partsep + mixture, default=None)) - def _parts_by_weight_vol(tokens): - #print("by weight or volume", tokens) - if tokens[-1] is None: - piece = tokens[1:-1:2] - fract = [float(v) for v in tokens[:-1:2]] - if abs(sum(fract) - 100) > 1e-12: - raise ValueError(f"Formula percentages must sum to 100%, not {sum(fract)}") - else: - piece = tokens[1:-1:2] + [tokens[-1]] - fract = [float(v) for v in tokens[:-1:2]] - fract.append(100-sum(fract)) - if fract[-1] < 0: - raise ValueError("Formula percentages must sum to less than 100%") - #print piece, fract - if len(piece) != len(fract): - raise ValueError("Missing base component of mixture") - return piece, fract - def convert_by_weight(string, location, tokens): - """convert mixture by wt% or mass%""" - piece, fract = _parts_by_weight_vol(tokens) - return _mix_by_weight_pairs(zip(piece, fract)) - mixture_by_weight.set_parse_action(convert_by_weight) - - mixture_by_volume = (number + volume_percent + mixture - + ZeroOrMore(partsep+number+(volume_percent|percent)+mixture) - + Optional(partsep + mixture, default=None)) - def convert_by_volume(string, location, tokens): - """convert mixture by vol%""" - piece, fract = _parts_by_weight_vol(tokens) - return _mix_by_volume_pairs(zip(piece, fract)) - mixture_by_volume.set_parse_action(convert_by_volume) - - mixture_by_layer = Forward() - layer_thick = Group(number + Regex(LENGTH_RE) + space) - layer_part = (layer_thick + mixture) | (opengrp + mixture_by_layer + closegrp + sub_count) - mixture_by_layer << layer_part + ZeroOrMore(partsep + layer_part) - def convert_by_layer(string, location, tokens): - """convert layer 
thickness '# nm material'""" - if len(tokens) < 2: - return tokens - piece = [] - fract = [] - for p1, p2 in zip(tokens[0::2], tokens[1::2]): - if isinstance(p1, Formula): - f = p1.thickness * float(p2) - p = p1 - else: - f = float(p1[0]) * LENGTH_UNITS[p1[1]] - p = p2 - piece.append(p) - fract.append(f) - total = sum(fract) - vfract = [(v/total)*100 for v in fract] - result = _mix_by_volume_pairs(zip(piece, vfract)) - result.thickness = total - return result - mixture_by_layer.set_parse_action(convert_by_layer) - - mixture_by_absmass = Forward() - absmass_mass = Group(number + Regex(MASS_VOLUME_RE) + space) - absmass_part = (absmass_mass + mixture) | (opengrp + mixture_by_absmass + closegrp + sub_count) - mixture_by_absmass << absmass_part + ZeroOrMore(partsep + absmass_part) - def convert_by_absmass(string, location, tokens): - """convert mass '# mg material'""" - if len(tokens) < 2: - return tokens - piece = [] - fract = [] - for p1, p2 in zip(tokens[0::2], tokens[1::2]): - if isinstance(p1, Formula): - p = p1 - f = p1.total_mass * float(p2) - else: - p = p2 - value = float(p1[0]) - if p1[1] in VOLUME_UNITS: - # convert to volume in liters to mass in grams before mixing - if p.density is None: - raise ValueError("Need the mass density of "+str(p)) - f = value * VOLUME_UNITS[p1[1]] * 1000.*p.density - else: - f = value * MASS_UNITS[p1[1]] - piece.append(p) - fract.append(f) - - total = sum(fract) - mfract = [(m/total)*100 for m in fract] - result = _mix_by_weight_pairs(zip(piece, mfract)) - result.total_mass = total - return result - mixture_by_absmass.set_parse_action(convert_by_absmass) - - ungrouped_mixture = (mixture_by_weight | mixture_by_volume - | mixture_by_layer | mixture_by_absmass) - grouped_mixture = opengrp + ungrouped_mixture + closegrp + Optional(density, default=None) - def convert_mixture(string, location, tokens): - """convert (mixture) @ density""" - formula = tokens[0] - if tokens[-1] == 'n': - formula.natural_density = tokens[-2] - elif 
tokens[-1] == 'i': - formula.density = tokens[-2] - # elif tokens[-1] is None - return formula - grouped_mixture.set_parse_action(convert_mixture) - - mixture << (compound | grouped_mixture) - formula = (compound | ungrouped_mixture | grouped_mixture) - grammar = Optional(formula, default=Formula()) + StringEnd() - - grammar.set_name('Chemical Formula') - return grammar - -_PARSER_CACHE: dict[PeriodicTable, "ParserElement"] = {} -def old_parser(formula_str: str, table: PeriodicTable|None=None) -> Formula: - """ - Parse a chemical formula, returning a structure with elements from the - given periodic table. - """ - table = default_table(table) - if table not in _PARSER_CACHE: - _PARSER_CACHE[table] = formula_grammar(table) - parser = _PARSER_CACHE[table] - #print(parser) - return parser.parse_string(formula_str)[0] - def _count_atoms(seq: Structure) -> dict[Atom, float]: """ Traverse formula structure, counting the total number of atoms. diff --git a/periodictable/lark_parse.py b/periodictable/lark_parse.py index 6b2ab83..d70e45d 100644 --- a/periodictable/lark_parse.py +++ b/periodictable/lark_parse.py @@ -831,8 +831,6 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: """ def check(): - from periodictable.formulas import old_parser - for line in examples.split('\n'): formula = line.split('#')[0] bad = line.startswith('!') @@ -844,9 +842,7 @@ def check(): else: print(f"*** {line}") try: - # Toggle the following to test pyparsing vs lark tree = parse_formula(formula) - #tree = old_parser(formula) if "##" not in line else "!!! 
pyparsing fails" density = getattr(tree, 'density', None) density_str = f" @ {density:.2f}" if density else "" mode = 'unicode' # unicode latex html plain diff --git a/pyproject.toml b/pyproject.toml index c4af2ef..120f656 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ # Matplotlib and uncertainties are optional packages, used for making # plots in the docs and generating neutron data tables for the web. # mypy checks all code so they are needed for testing as well. - optional = ["uncertainties", "matplotlib", "pytparsing>=3.0.0"] + optional = ["uncertainties", "matplotlib"] docs = ["sphinx", {include-group = "optional"}] test = ["pytest", "pytest-cov", "pytest-mypy", {include-group = "optional"}] dev = [ From 1ccb4a3e1d975ef5cd0424637eb832f8edfd5ed0 Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Fri, 22 May 2026 11:58:41 -0400 Subject: [PATCH 13/19] tweak formula docs --- doc/sphinx/guide/formula_grammar.rst | 74 +++++++++++++++++++--------- periodictable/lark_parse.py | 44 +++++++++-------- 2 files changed, 73 insertions(+), 45 deletions(-) diff --git a/doc/sphinx/guide/formula_grammar.rst b/doc/sphinx/guide/formula_grammar.rst index da694cd..014b795 100644 --- a/doc/sphinx/guide/formula_grammar.rst +++ b/doc/sphinx/guide/formula_grammar.rst @@ -159,9 +159,20 @@ The grammar used for parsing formula strings is the following: :: + # formula: composite @ density | str:sequence @ density | mixture formula : compound | mixture + compound : (composite | fasta) [density] + # Density applies to the entire composite, such as "NaCl + 29.2H2O @ 1.07n" + # For the density of a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n" + + # Activation only cares about total mass, so you can freely mix masses and volumes if + # you have the density for each component. For scattering you need the density of the + # mixture. When this is different from the mixture of densities use (mixture)@density. 
+ # For thin film samples, allow stacking of layers with the thickness of each layer. + # With density for each layer the relative quantities of each element in the stack can + # be calculated. Convert to mass by multiplying density by thickness (cm) and area (cm²). - # Mixture definitions: quantity compound // quantity compound // quantity compound + # mixture: quantity compound // quantity compound // ... mixture : byamount | byvolume | byweight | layers byamount : quantity compound (MIX quantity compound)* byvolume : volumepct compound (MIX percentage compound)* MIX compound @@ -172,13 +183,20 @@ The grammar used for parsing formula strings is the following: volumepct : NUMBER SPACE? VOLUMEPCT SPACE thickness : NUMBER SPACE? LENGTH SPACE percentage : NUMBER SPACE? "%" SPACE # Allows "3 % " + MIX : SPACE? "//" SPACE? + WEIGHTPCT : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/ + VOLUMEPCT : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/ + MASS : "kg" | "g" | "mg" | "ug" | "μg" | "ng" + VOLUME : "L" | "mL" | "uL" | "μL" | "nL" + LENGTH : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å" - # Compound definition: number group ... @density where group is El count El count ... - # FASTA sequences: (rna|dna|aa) : SEQUENCE @ density - # Density applies to the entire formula, such as "NaCl + 29.2H2O @ 1.07n" - # For the density of a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n" - compound : (composite | fasta) [density] + # FASTA sequence: (rna|dna|aa):SEQUENCE @ density fasta : FASTA ":" SEQUENCE + FASTA : /[a-z]+/ # str:sequence reports better errors than /dna|rna|aa/:sequence + SEQUENCE : /[-A-Z *]+/ + + # composite: number group number group ... @density + # group: El count El count ... 
composite : [NUMBER] group (SEPARATOR [NUMBER] group)* group : ((atom | isoatom | "(" formula ")") [COUNT])+ atom : SYMBOL [isotope] [valence] @@ -186,23 +204,23 @@ The grammar used for parsing formula strings is the following: isotope : "[" INTEGER "]" valence : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE density : SPACE? "@" SPACE? DENSITY [DENSITYMODE] - - # Tokens - #FASTA : /dna|rna|aa/ # Sequence type is limited to these values but ... - FASTA : /[a-z]+/ # "type:sequence" syntax allows better error reporting - SEQUENCE : /[-A-Z *]+/ # could list all elements, but better error reporting if element symbol lookup fails SYMBOL : /[A-Z][a-z]*/ CHARGE : /[+]+|[-]+/ # allow valence using {++} or {--} + SUPERCHARGE: /\u207A+|\u207B+/ # unicode valence such as Ca⁺⁺ and O²⁻ DENSITY : NUMBER # using alias DENSITY for number for better error reporting DENSITYMODE: /[ni]/ # n=natural density, i=isotopic density - MIX : SPACE? "//" SPACE? - WEIGHTPCT : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/ - VOLUMEPCT : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/ - MASS : "kg" | "g" | "mg" | "ug" | "μg" | "ng" - VOLUME : "L" | "mL" | "uL" | "μL" | "nL" - LENGTH : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å" COUNT : NUMBER | SUBNUM # atom counts can be normal numbers or unicode subscripts + SEPARATOR : SPACE? /[+•·]/ SPACE? | SPACE # For example, CaCO₃·6H₂O + + SPACE : /[ \\t\\n\\r]+/ + NUMBER : INTEGER | FRACTION + INTEGER : /[1-9][0-9]*/ + FRACTION : /([1-9][0-9]*|0)?[.][0-9]*/ # allow all floats? 
+ SUBNUM : SUBINT | SUBFRAC + SUBINT : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*)/ + SUBFRAC : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*|)([.][\u2080-\u2089]*)/ + SUPERINT : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/ Formulas can also be constructed from atoms or other formulas: @@ -281,18 +299,26 @@ following is a 2:1 mixture of water and heavy water: >>> H2O = formula('H2O',natural_density=1) >>> D2O = formula('D2O',natural_density=1) >>> mix = mix_by_volume(H2O,2,D2O,1) - >>> print(f"{mix} {mix.density:.4g}") - (H2O)2D2O 1.037 + >>> print(f"{mix} @ {mix.density:.4g}") + (H2O)2D2O @ 1.037 -Note that this is different from a 2:1 mixture by weight: +This is different from a 2:1 mixture by weight: >>> mix = mix_by_weight(H2O,2,D2O,1) - >>> print(f"{mix} {mix.density:.4g}") - (H2O)2.22339D2O 1.035 + >>> print(f"{mix} @ {mix.density:.4g}") + (H2O)2.22339D2O @ 1.035 Except in the simplest of cases, the density of the mixture cannot be -computed from the densities of the components, and the resulting density -should be set explicitly. +computed from the densities of the components. Even when the component +density is known the resulting density should be set explicitly: + + >>> mix = mix_by_weight("NaCl@2.17", 0.1, "H2O@1", 0.9) + >>> print(f"{mix} @ {mix.density:.4g}") + NaCl(H2O)29.1956 @ 1.057 + >>> mix = mix_by_weight("NaCl@2.17", 0.1, "H2O@1", 0.9, density=1.07) + >>> print(f"{mix} @ {mix.density:.4g}") + NaCl(H2O)29.1956 @ 1.07 + Derived values -------------- diff --git a/periodictable/lark_parse.py b/periodictable/lark_parse.py index d70e45d..6a696e8 100644 --- a/periodictable/lark_parse.py +++ b/periodictable/lark_parse.py @@ -19,18 +19,24 @@ VOLUME_UNITS = {'nL': 1e-9, 'uL': 1e-6, 'mL': 1e-3, 'L': 1e+0} # TODO: use grammar string directly in the sphinx/guide/formula_grammar.rst +# Any changes to the grammar below should be copied to formula_grammar.rst grammar = """ start : SPACE? formula SPACE? 
# strip blank space from start and end + +# formula: composite @ density | str:sequence @ density | mixture formula : compound | mixture +compound : (composite | fasta) [density] +# Density applies to the entire composite, such as "NaCl + 29.2H2O @ 1.07n" +# For the density of a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n" -# Mixture definitions: quantity compound // quantity compound // quantity compound # Activation only cares about total mass, so you can freely mix masses and volumes if # you have the density for each component. For scattering you need the density of the # mixture. When this is different from the mixture of densities use (mixture)@density. # For thin film samples, allow stacking of layers with the thickness of each layer. # With density for each layer the relative quantities of each element in the stack can -# be calculated. Convert to mass by multiplying by thickness (cm) and area (cm²). +# be calculated. Convert to mass by multiplying density by thickness (cm) and area (cm²). +# mixture: quantity compound // quantity compound // quantity compound mixture : byamount | byvolume | byweight | layers byamount : quantity compound (MIX quantity compound)* byvolume : volumepct compound (MIX percentage compound)* MIX compound @@ -41,14 +47,21 @@ volumepct : NUMBER SPACE? VOLUMEPCT SPACE thickness : NUMBER SPACE? LENGTH SPACE percentage : NUMBER SPACE? "%" SPACE # Allows "3 % " +MIX : SPACE? "//" SPACE? +WEIGHTPCT : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/ +VOLUMEPCT : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/ +MASS : "kg" | "g" | "mg" | "ug" | "μg" | "ng" +VOLUME : "L" | "mL" | "uL" | "μL" | "nL" +LENGTH : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å" -# Composite: number group ... @density where group is El count El count ... 
-# Density applies to the entire composite, such as "NaCl + 29.2H2O @ 1.07n" -# For the density of a mixture you need parentheses: "(10 wt% NaCl // H2O)@1.07n" -# FASTA sequences: (rna|dna|aa) : SEQUENCE @ density -# Note: optional `[token]` leaves a None placeholder in the tree, unlike `token?` -compound : (composite | fasta) [density] +# FASTA sequence: (rna|dna|aa):SEQUENCE @ density fasta : FASTA ":" SEQUENCE +FASTA : /[a-z]+/ # str:sequence reports better errors than /dna|rna|aa/:sequence +SEQUENCE : /[-A-Z *]+/ + +# composite: number group number group ... @density +# group: El count El count ... +# Note: optional `[token]` leaves a None placeholder in the tree, unlike `token?` composite : [NUMBER] group (SEPARATOR [NUMBER] group)* group : ((atom | isoatom | "(" formula ")") [COUNT])+ atom : SYMBOL [isotope] [valence] @@ -56,25 +69,15 @@ isotope : "[" INTEGER "]" valence : "{" [INTEGER] CHARGE "}" | [SUPERINT] SUPERCHARGE density : SPACE? "@" SPACE? DENSITY [DENSITYMODE] - -# Tokens -#FASTA : /dna|rna|aa/ # Sequence type is limited to these values but ... -FASTA : /[a-z]+/ # "str:sequence" syntax allows better error reporting -SEQUENCE : /[-A-Z *]+/ # could list all elements, but better error reporting if element symbol lookup fails SYMBOL : /[A-Z][a-z]*/ CHARGE : /[+]+|[-]+/ # allow valence using {++} or {--} +SUPERCHARGE: /\u207A+|\u207B+/ # unicode valence such as Ca⁺⁺ and O²⁻ DENSITY : NUMBER # using alias DENSITY for number for better error reporting DENSITYMODE: /[ni]/ # n=natural density, i=isotopic density -MIX : SPACE? "//" SPACE? -WEIGHTPCT : /%w((eigh)?t)?/ | /w((eigh)?t)?%/ | /%m(ass)?/ | /m(ass)?%/ -VOLUMEPCT : /%v(ol(ume)?)?/ | /v(ol(ume)?)?%/ -MASS : "kg" | "g" | "mg" | "ug" | "μg" | "ng" -VOLUME : "L" | "mL" | "uL" | "μL" | "nL" -LENGTH : "cm" | "mm" | "um" | "μm" | "nm" | "Ang" | "Å" COUNT : NUMBER | SUBNUM # atom counts can be normal numbers or unicode subscripts +SEPARATOR : SPACE? /[+•·]/ SPACE? 
| SPACE # For example, CaCO₃·6H₂O -SEPARATOR : SPACE? /[+•·]/ SPACE? | SPACE SPACE : /[ \\t\\n\\r]+/ NUMBER : INTEGER | FRACTION INTEGER : /[1-9][0-9]*/ @@ -83,7 +86,6 @@ SUBINT : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*)/ SUBFRAC : /(\u2080|[\u2081-\u2089][\u2080-\u2089]*|)([.][\u2080-\u2089]*)/ SUPERINT : /(\u2070|[\u00B9\u00B2\u00B3\u2074-\u2079][\u2070\u00B9\u00B2\u00B3\u2074-\u2079]*)/ -SUPERCHARGE: /\u207A+|\u207B+/ # Allow Ca++ and Cl- using superscript + and - """ # propagate_positions saves start_pos and end_pos for each rule as well as each terminal. From 40a1b6ceb076206fce54f2650f132ca9c7247d09 Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Fri, 22 May 2026 12:09:55 -0400 Subject: [PATCH 14/19] improve output of error handling demo python -m periodictable.lark_parse --- periodictable/lark_parse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/periodictable/lark_parse.py b/periodictable/lark_parse.py index 6a696e8..0b19aa6 100644 --- a/periodictable/lark_parse.py +++ b/periodictable/lark_parse.py @@ -839,6 +839,7 @@ def check(): if bad: formula = formula[1:] if formula: + print() if bad: print(f"!!! 
{line[1:]}") else: From abe50587849a24cbce44b4230acd2188c80ce44f Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Fri, 22 May 2026 12:22:47 -0400 Subject: [PATCH 15/19] attempt to fix missing lark on CI --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 120f656..9182798 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ {include-group = "build"}, {include-group = "docs"}, {include-group = "test"}, + "periodictable", ] [project.urls] From e2ce8f7b2c968c0d19aa7d82b86ac3ada28e1f1b Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Fri, 22 May 2026 12:24:30 -0400 Subject: [PATCH 16/19] attempt to fix missing lark on CI --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9182798..b40f53e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ {include-group = "build"}, {include-group = "docs"}, {include-group = "test"}, - "periodictable", + "numpy", "lark", ] [project.urls] From cc49581efe6eacdbd5f8ec1edd9c29f3b64759dc Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Fri, 22 May 2026 12:26:36 -0400 Subject: [PATCH 17/19] attempt to fix missing lark on CI --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index b40f53e..a327bf0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ {include-group = "build"}, {include-group = "docs"}, {include-group = "test"}, + # TODO: Shouldn't have to copy base dependencies here...is there a better way? 
"numpy", "lark", ] From e3308e643e7b9d5319fc576ee18317c1c363b277 Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Fri, 29 May 2026 14:02:43 -0400 Subject: [PATCH 18/19] Fix conversion from 8 and 9 to superscript 8 and 9 --- periodictable/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/periodictable/util.py b/periodictable/util.py index 0b7267c..5884f1f 100644 --- a/periodictable/util.py +++ b/periodictable/util.py @@ -116,7 +116,7 @@ def unicode_superscript(value: str) -> str: '1': '\u00B9', '0': '\u2070', 'i': '\u2071', '4': '\u2074', '5': '\u2075', '6': '\u2076', '7': '\u2077', - '9': '\u2078', '0': '\u2079', '+': '\u207a', '-': '\u207b', + '8': '\u2078', '9': '\u2079', '+': '\u207a', '-': '\u207b', '=': '\u207c', '(': '\u207d', ')': '\u207e', 'n': '\u207f', '\u2013': '\u207b', # en-dash is same as dash From 12e1a06ac27f0bd66c43659e94fffe719bbe0e2e Mon Sep 17 00:00:00 2001 From: Paul Kienzle Date: Fri, 29 May 2026 15:09:25 -0400 Subject: [PATCH 19/19] group lark parsing examples by what is being tested --- periodictable/lark_parse.py | 150 +++++++++++++++++++++--------------- 1 file changed, 86 insertions(+), 64 deletions(-) diff --git a/periodictable/lark_parse.py b/periodictable/lark_parse.py index 0b19aa6..f19efd0 100644 --- a/periodictable/lark_parse.py +++ b/periodictable/lark_parse.py @@ -749,86 +749,106 @@ def parse_formula(formula_str: str, table: PeriodicTable|None=None) -> Formula: return tree # Error conditions are marked with '!' so the exception is ignored -# Lines marked ## fail on the existing parser +# Lines marked ## fail on the pyparsing parser examples = """ -! DNA:CAGT # incorrect case for FASTA type not properly identified -! dna CAGT # missing colon in FASTA -! O² # SUPERCHARGE should be the only valid token here -! ₃H2O # badly placed subscript -! // 3g Ca # // is not a comment -! 3g Ca@ // 5g Si # missing density value -! Ca@i # missing density value ## -! Ca ⁺⁺ # extra space before valence -! 
Ca++ # missing braces in valence: the + is acting as SEPARATOR -! Ca2+ # missing braces in valence: the 2 is acting as COUNT and the + as SEPARATOR -! Ca{2} # missing charge in valence -! 37 vol% H2O@1 / 5% D2O@1 # missing / -! 37 vol% H2O@1 /// 5% D2O@1 # extra / -! H2O@1h # bad density mode -! 37 vol% NaCl@2.16 // H2O@1 // D2O@1 # percent missing in middle part -! 37 vol% H2O@1 // 5% D2O@1 # percent not allowed in last part -! 37 vol% H2O@1 // 5 vol% D2O@1 # only % in subsequent parts -! 37% H2O@1 // D2O@1 # missing vol% or wt% -! 37 val% H2O@1 // D2O@1 # bad spelling of vol% -! Fe[56O2 # bad isotope syntax -! Co[181] # bad isotope -! Ca{2+O2 # bad valence syntax -! Co{17-} # bad valence -! 3..5 mg NaCl -! 3.5 fm Si # bad units at the start; could be wt%/vol% or LENGTH, VOLUME, MASS -! 3.5 mm Si // 2.5 nm SiO2 // -! 3.5 mm Si // 2.5 nm SiO2 // 35 mm cG -! ((Co) # mismatched LPAR -! Co) # mismatched RPAR -! bad:CAGT # bad sequence type + +# === Composite tests === Co -dna:CAGT -(Co@5) ## -(((Co@5)@6)) ## +H2SO4 CaCO3 CaCO₃ +(Co@5) ## +(((Co@5)@6)) ## CaCO3+6H2O CaCO3 6H2O CaCO3(H2O)6 CaCO3 (H2O)6 (Ca(CO3)((H2O)6)) -CaCO₃·6H₂O ## +CaCO₃·6H₂O ## +! Bl2Oh # bad symbol +! (Co # mismatched LPAR +! Co) # mismatched RPAR +! ((Co) # mismatched LPAR +! ₃H2O # badly placed subscript + +# === Isotope tests === DHO -!Ca{2++} # bad valence string -Ca⁺⁺ # also Ca{2+} ## -O²⁻ ## H[1] -²H⁺ # D{+} ## -O²H⁻ # OD{-} ## -O²⁻H⁺ # O{2-}H{+} ## -O²⁻²H⁺ # O{2-}D{+} ## -H2O@1 -D2O@1n -D2O @ 1.11 ## -D2O@1.11i -HO{1-} +¹⁸O₂ +! Fe[56O2 # bad isotope syntax +! Co[181] # bad isotope + +# === Valence tests === +Ca{2+} +Ca{++} +Ca⁺⁺ ## +O{2-} +O{--} +O²⁻ ## +H{+} +H{-} +HO{1-} # HO- applies to the group, but valence is attached to O H[1]{1-}O -H2SO4 -C3H4H[1]NO@1.29n +²H⁺ # D{+} ## +O²H⁻ # no ambiguity since valence requires a trailing + or - ## +O²⁻H⁺ # O{2-}H{+} ## +O²⁻²H⁺ # O{2-}D{+} ## +! Ca{2} # missing charge in valence +! Ca{2++} # can't use number++ +! 
Ca{2+O2 # missing close brace on valence +! Co{17-} # bad valence value +! Ca ⁺⁺ # extra space before valence +! Ca++ # missing braces in valence: the + is acting as SEPARATOR +! Ca2+ # missing braces in valence: the 2 is acting as COUNT and the + as SEPARATOR +! O² # Should be looking for SUPERCHARGE (e.g., O²⁻) or SYMBOL (e.g., O²H) + +# === Density tests === +H2O@1 # density is 1, where H and O use natural abundance +H2O @ 1 # spaces allowed around '@' ## +D2O@1n # natural density "n" is 1 so isotopic density is 1.11 +D2O@1.11i # isotopic density is 1.11 +D2O@1.11 # default is "i" for isotopic density +C3H4H[1]NO@1.29n # another natural density example 78.2H2O[16] + 21.8H2O[18] @1n # density applies to composite -dna:CAGT @1n # fasta density override -50 wt% Co // Ti -33 wt% Co // 33% Fe // Ti -! 93 wt% Co // 33% Fe // Ti # More than 100 wt% -! 93 vol% Co // 33% Fe // Ti # More than 100 vol% +! 3g Ca@ // 5g Si # missing density value +! Ca@i # missing density value ## +! H2O@1h # bad density mode + +# === Mixture tests === +50 wt% Co // Ti # mix by mass; final component does not need percentage +33 wt% Co // 33% Fe // Ti # intermediate components need percentage +! 93 wt% Co // 33% Fe // Ti # more than 100 wt% +! 93 vol% Co // 33% Fe // Ti # more than 100 vol% 20 vol% (10 wt% NaCl@2.16 // H2O@1) // D2O@1n -NaCl(H2O)29.1966(D2O)122.794@1.10i -5g NaCl // 50mL H2O@1 -5g NaCl@2.16 // 50mL H2O@1 -! 5g NaCl // 50mL H2O # Need density for H2O to convert volume to mass -(10 wt% NaCl // H2O)@1.07n # set density of a mixture +5g NaCl // 50mL H2O@1 # volume components need density to determine mass fraction +5g NaCl@2.16 // 50mL H2O@1 # need component densities to estimate mixture density +NaCl(H2O)29.1966(D2O)122.794@1.10i # mixture rendered as formula +! 
5g NaCl // 50mL H2O # need density for H2O to convert volume to mass +(10 wt% NaCl // H2O)@1.07n # set density of a mixture 50 mL (45 mL H2O@1 // 5 g NaCl)@1.0707 // 20 mL D2O@1n 1 cm Si // 5 nm Cr // 10 nm Au -aa:RELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV +! 4 nm NaCl@2.17// 50 g Si # can't use mass in layer mixture +! 3..5 mg NaCl # bad number format +! 5 Mg NaCl // 50mL H2O@1 # bad units +! 3.5 fm Si # bad units; expecting wt%/vol% or LENGTH, VOLUME, MASS +! 3.5 mm Si // 2.5 nm SiO2 // # missing final component of mixture +! 3.5 mm Si // 2.5 nm SiO2 // 35 mm cG # bad final component of mixture +! // 3g Ca # // is not a comment +! 37 vol% H2O@1 / 5% D2O@1 # missing / +! 37 vol% H2O@1 /// 5% D2O@1 # extra / +! 37 vol% NaCl@2.16 // H2O@1 // D2O@1 # percent missing in middle part +! 37 vol% H2O@1 // 5% D2O@1 # percent not allowed in last part +! 37 vol% H2O@1 // 5 vol% D2O@1 # only % in subsequent parts +! 37% H2O@1 // D2O@1 # missing vol% or wt% +! 37 val% H2O@1 // D2O@1 # bad spelling of vol% -! Bl2Oh # Bad symbol -! 5 Mg NaCl // 50mL H2O@1 # Bad units -! 4 nm NaCl@2.17// 50 g Si # Can't use mass in layer mixture +# === FASTA tests === +dna:CAGT +dna:CAGT @1n # can override the density of a FASTA sequence +aa:RELEELNVPGEIVESLSSSEESITRINKKIEKFQSEEQQQTEDELQDKIHPFAQTQSLVYPFPGPIPNSLPQNIPPLTQTPVVVPPFLQPEVMGVSKVKEAMAPKHKEMPFPKYPVEPFTESQSLTLTDVENLHLPLPLLQSWMHQPHQPLPPTVMFPPQSVLSLSQSKVLPVPQKAVPYPQRDMPIQAFLLYQEPVLGPVRGPFPIIV +! DNA:CAGT # incorrect case for FASTA type +! dna CAGT # missing colon between FASTA type and sequence +! bad:CAGT # bad FASTA sequence type """ @@ -838,7 +858,7 @@ def check(): bad = line.startswith('!') if bad: formula = formula[1:] - if formula: + if formula.strip(): print() if bad: print(f"!!! 
{line[1:]}") @@ -862,6 +882,8 @@ def check(): continue # pyparsing should fail but doesn't if bad: raise RuntimeError(f"Exception not raised for <{formula}>") + else: + print(line) def main(): import sys