#!/bin/sh
# Convert (ASCII7) .bib BibTeX file on stdin to HTML5 list.
# The output HTML5 need not be XHTML, ie can be compact while compliant.
# Each BibTeX citation ID is prefixed with c- to make a linkable HTML ID.
# Entries that cannot be converted will be flagged as errors on stderr.
#
# Will escape HTML metacharacters for safety and correct display, eg "&".
# Will convert some LaTeX escapes to HTML, eg {\'e} to &eacute; accented "e".
#   * see https://en.wikibooks.org/wiki/LaTeX/Special_Characters#Escaped_codes
#
# { } are removed at least within fields author, title, booktitle.
#
# Aiming to very roughly ape aspects of (eg) ieeetr format
# from https://www.overleaf.com/learn/latex/Bibtex_bibliography_styles

# DHD20231004: insist that each field line end with , to keep biber happy.


##########
# May be used / adapted / etc without any promise of fitness for purpose
# under the terms of the Apache License Version 2.0, January 2004
#     http://www.apache.org/licenses/LICENSE-2.0
##########


# TODO:
#   * Deal with 'article-number' eg per:
#     https://tex.stackexchange.com/questions/259052/cite-article-number-in-proceedings-instead-of-pages


# Usage:
#     $0 [-subpage|-mainpage|-terse citationID] [-full|-lite|-d|-m|-o]
#
#         -full (or -d)  generate full HTML suitable for desktop.
#
#         -lite (or -m or -o)  generate lighter-weight HTML.
#             Suitable for mobile: omits/abbreviates metadata, abstract, etc.
#
#         -terse citationID  generate terse output for only citationID.
#             Generate only the title in a <cite>,
#             optionally linked by any URL present.
#             No list wrapper is generated.
#
#         -subpage  generate output suitable for a sub-page (not at root).
#             This has a single entry in a lightweight page below the root.
#             No list wrapper is generated.
#
#         -mainpage  generate output suitable for the main biblio page at root.
#             This has all entries linked in compact form.

# Option state, with defaults; exported to the gawk program further down.
LITE=false
TERSE=false
MAINPAGE=false
CITEID=""

# parse_args ARGS...
# Parse the command-line options described in the usage text above.
# Globals written: LITE, TERSE, MAINPAGE, CITEID.
# Exits with status 1 (message on stderr) for an unrecognised option
# or when -terse is given without its citationID argument.
parse_args() {
    while [ $# -gt 0 ]; do
        case "$1" in
            -full|-d) LITE="false"; shift 1;;
            -lite|-m|-o) LITE="true"; shift 1;;
            -terse)
                # Without this check "shift 2" fails when the ID is missing:
                # some shells then do not shift at all and loop forever.
                if [ $# -lt 2 ]; then
                    echo "ERROR: option $1 requires a citationID." 1>&2
                    exit 1
                fi
                TERSE="true"; MAINPAGE=false; CITEID="$2"; shift 2;;
            -subpage) TERSE=false; MAINPAGE=false; shift 1;;
            -mainpage) TERSE=false; MAINPAGE="true"; shift 1;;
            *) echo "ERROR: option $1 unrecognised." 1>&2; exit 1;;
        esac
    done
}

parse_args "$@"

#echo "INFO: LITE=$LITE TERSE=$TERSE MAINPAGE=$MAINPAGE" 1>&2

# Replace this shell with gawk so that the exit status of gawk
# becomes the exit status of this script.
# The option state parsed above is handed over as awk variables via -v.
# gawk specifically is required: the program uses the three-argument match().
# Everything up to the closing single quote is the awk program,
# so the program text must not contain a bare single quote.
exec gawk -v LITE="$LITE" -v TERSE="$TERSE" -v MAINPAGE="$MAINPAGE" -v CITEID="$CITEID" '

# One-time setup: output limits, error state, the opening list tag,
# and (when not terse) the copyright notice -> licence URL lookup table.
# Reads: LITE, TERSE (set with -v by the launcher).
# Writes: MAXAUTHORS, MAXTEXT, MINMETADATA, errstatus, prevCiteID,
#     and the parallel arrays C2Lc[] (copyright text) / C2Ll[] (licence URL).
BEGIN {
    # Maximum author count before resorting to et al.
    MAXAUTHORS=5
    if("true" == LITE) { MAXAUTHORS=1; }
    # Maximum abstract (etc) text length before folding away by default.
    # This is picked to keep entry length shorter on average.
    # We may abbreviate abstracts to about this many characters for compactness.
    #MAXTEXT=70
    MAXTEXT=250
    if("true" == LITE) { MAXTEXT=200; }

    #IGNORECASE = 1
    errstatus=""

    # Minimise metadata when LITE.
    MINMETADATA=LITE

    # Sorts before any plausible citation ID for the ordering check.
    prevCiteID = "AAAAAAAAAAAAAAAAAAAAAAAA";

    # Force a clear:both and contain:content for robustness and efficiency.
    if("true" != TERSE) { print "<ul class=\"cb conc\">"; }

    if("true" != TERSE) {
        # Location of copyright notice -> licence URL lookup file.
        # See end for format.
        C2L="db.bibliography/copyright-licenceURL.csv"
        # Read in C2L if not in terse mode.
        c2lindex = 0;
        # getline returns -1 on error (eg unreadable file), which is true
        # in awk, so compare with 0 explicitly rather than loop forever.
        while((c2lrc = (getline line < C2L)) > 0) {
            ++c2lindex;
            if(match(line, /^#/)) { continue; } # Skip comment lines.
            # Cheat and strip out all quotes in one go...
            gsub(/"/, "", line);
            # Split into text and URL.
            ns = split(line, ac2l, /,/);
            if(2 != ns) {
                print "ERROR: bad line in "C2L" : "line | "cat 1>&2";
                break;
                }
            # File in the lookup array.
            ctext=ac2l[1];
            lURL=ac2l[2];
            C2Lc[c2lindex] = ctext;
            C2Ll[c2lindex] = lURL;
            }
        if(c2lrc < 0) {
            # Carry on without the lookup: copyright notices stay unlinked.
            print "WARNING: cannot read "C2L | "cat 1>&2";
            }
        close(C2L);
#print "INFO: C2L copyright to URL entries: "length(C2Lc) | "cat 1>&2"
        }
    }

# Skip lines that carry no data:
#   * blank lines (nothing but spaces), and
#   * comment lines (first non-space character is %).
# Such lines get no further processing.
/^ *($|%)/ { next; }

# Flag as an error any line holding a character outside space..tilde.
# Such characters may be interpreted differently in (say) LaTeX and HTML,
# so should be encoded as a LaTeX escape sequence if needed.
# The first offending field is quoted in the message to help find it.
# There is no "next" here: the line still goes through the later rules.
/[^ -~]/ {
    badword = "";
    i = 1;
    while(i <= NF) {
        if($i ~ /[^ -~]/) { badword = $i; break; }
        i++;
        }
    errstatus = "non-ASCII char(s) in ["citeID"] at line "NR" - "badword" - "$0;
    }

# A record opens with a line of the form @type{citeID,
# Counts the record, captures its (lower-cased) type and its citation ID,
# and empties the attribute store ready for the field lines that follow.
/^@/ {
    # Both markers are cleared when a record closes,
    # so either still being set means the previous "}" line was missing.
    if(!(("" == type) && ("" == citeID))) {
        errstatus = "missing close / record end before line "NR;
    }

    ++count;

    # gawk three-argument match(): element [1] holds the bracketed group,
    # and is empty if the line does not have the expected shape.
    match($0, /^@([a-zA-Z]+){/, hdrType);
    match($0, /{([a-zA-Z0-9:_-]+) *, *$/, hdrID);
    type = tolower(hdrType[1]);
    citeID = hdrID[1];

    # Forget every field of the previous record.
    delete attributes;

    # No further processing of this line.
    next;
}

# Record ends with "}" on its own line.
# Emits the HTML for the record collected since its @type{citeID, line,
# in one of three shapes:
#   * terse: the title in a <cite>, linked if there is a URL, only for CITEID;
#   * main page: compact entry linking to the sub-page for the record;
#   * sub-page: full entry with schema.org microdata.
# Reads: type, citeID, attributes[], TERSE, MAINPAGE, LITE, MINMETADATA,
#     MAXAUTHORS, MAXTEXT, CITEID, prevCiteID, C2Lc[], C2Ll[].
# Writes: HTML to stdout, INFO lines to stderr, errstatus on problems,
#     and clears type and citeID ready for the next record.
# Each attribute is deleted as it is consumed so that, in sub-page mode,
# anything left over can be reported as unused.
/^ *} *$/ {
    title = attributes["title"]; delete attributes["title"];
    url = attributes["url"]; delete attributes["url"];

    if(("" == type) || ("" == citeID)) {
        errstatus = "unexpected close / record end on line "NR;
    } else if("true" == TERSE) {
        # If terse, output terse form, and only for the specific ID.
        if(citeID == CITEID) {
            if("" != title) {
                if("" != url) { printf("<a href=\"%s\">", url); }
                printf("<cite>%s</cite>", title);
                if("" != url) { printf("</a>"); }
                }
            }
    } else if("true" == MAINPAGE) {
        # For main page (full bibliography) keep things brief...
        if("true" != LITE) {
            # For now keep "c-XXX" target for old links into desktop main page.
            printf("<li id=c-%s>", citeID);
        } else {
            # DHD20250421: direct links should be to the sub-page, not main.
            printf("<li>");
            }

        # Link to sub-page entry.
        printf("[<a href=bibliography/%s.html>%s</a>] ", citeID, citeID);

        if("" != title) {
            if("" != url) { printf("<a href=\"%s\">", url); }
            printf("%s", title);
            if("" != url) { printf("</a>"); }
            print ""; # Force newline to make HTML easier to read...
            }
    } else {
        # Sub-page generation.
        # Generate the HTML record from the data captured.
        # A leading comma and space is added to most elements.

        # Check ordering.
        if(tolower(prevCiteID) >= tolower(citeID)) {
            errstatus = "IDs out of order, "prevCiteID" should be after "citeID;
            }
        prevCiteID = citeID;

        # Collect some attributes early.
        journal = attributes["journal"]; delete attributes["journal"];

        # Map the BibTeX type to a schema.org type and publication status.
        CWType="CreativeWork"
        workStatus=""
        if("article" == type) {
            CWType="Article";
            if("" != journal) { CWType="ScholarlyArticle"; }
            workStatus="Published";
            }
        if("book" == type) { CWType="Book"; workStatus="Published"; }
        if("dataset" == type) { CWType="Dataset"; workStatus="Published"; }
        if("mastersthesis" == type) { CWType="Thesis"; workStatus="Published"; }
        if("online" == type) { CWType="WebPage"; } # BibLaTeX?
        if("phdthesis" == type) { CWType="Thesis"; workStatus="Published"; }
        if("report" == type) { CWType="Report"; workStatus="Published"; }
        if("techreport" == type) { CWType="Report"; workStatus="Published"; }
        if("thesis" == type) { CWType="Thesis"; workStatus="Published"; }
        if("unpublished" == type) { workStatus="Draft"; } # Eg preprint.
        if("www" == type) { CWType="WebPage"; } # MDPI journals

        # Open a new list item.
        #printf("<li id=c-%s itemprop=citation itemscope itemtype=http://schema.org/%s>", citeID, CWType);
        # DHD20250421: ID is redundant in the single-entry sub-pages.
        printf("<li itemprop=citation itemscope itemtype=http://schema.org/%s>", CWType);
        printf("[");
        if("true" != MINMETADATA) { printf("<span itemprop=alternateName>"); }
        # Linking canonically: //WWW. will initially be used as-is in sub-pages.
        #/printf("<a href=#c-%s>%s</a></span>]", citeID, citeID);
        printf("<a href=//WWW.earth.org.uk/bibliography/%s.html>%s</a>", citeID, citeID);
        # Close the span only if it was opened above.
        if("true" != MINMETADATA) { printf("</span>"); }
        printf("]");

        author = attributes["author"]; delete attributes["author"];
        if("" != author) {
            # Special case for DHD!
            DHD1="Damon Hart-Davis";
            DHD2="Hart-Davis, Damon";
            # Nominally the inner span itemprop=name is redundant.
            #DHDMD=" <span itemprop=\"author creator\" itemscope itemtype=http://schema.org/Person itemref=pgAuthor><span itemprop=name>%s</span></span>";
            DHDMD=" <a href=http://d.hd.org itemprop=\"author creator\" itemscope itemtype=http://schema.org/Person><span itemprop=name>%s</span></a>";
            #if("true" == MINMETADATA) { DHDMD=" %s"; }

            # Split at "; " or " and " into multiple author entries.
            # The ; may not be part of an HTML entity, eg Ren&eacute; surname.
            ns = split(author, as, / *;  */);
            na = split(author, aa, / * and  */);
#print "INFO: author count by ;/and "ns"/"na" for "citeID | "cat 1>&2"
            if((ns == 1) && (na == 1)) {
                # Single author.
                # This could be a person or an organisation.
                if("true" == MINMETADATA) {
                    printf(" %s", author);
                } else if(1 == split(author, a1s, " ")) {
                    # Taken to be an organisation if a single word.
                    printf(" <span itemprop=\"author creator\" itemscope itemtype=http://schema.org/Organization><span itemprop=name>%s</span></span>", author);
                } else if((author == DHD1) || (author == DHD2)) {
                    # Special case if DHD.
                    printf(DHDMD, author);
                } else {
                    # A type cannot be inferred.
                    printf(" <span itemprop=author>%s</span>", author);
                    }
            } else if((ns > 1) && (na > 1)) {
                # Both separators present: do not split in this ambiguous case.
                # DHD20231112: this may well not be a (single) human.
                # May get issues with HTML accent entities...
                if("true" == MINMETADATA) {
                    printf(" %s", author);
                    # NOTE(review): this INFO is only emitted when metadata is
                    # minimised; confirm whether it was meant for both cases.
                    print "INFO: *** ambiguous author split for *** "citeID | "cat 1>&2"
                } else {
                    printf(" <span itemprop=author>%s</span>", author);
                    }
            } else {
                # Exactly one of the separators split the list: use that one.
                n = (ns > 1) ? ns : na;
                np = n;
                if(n > MAXAUTHORS) { np = MAXAUTHORS; }
                for(i = 1; i <= np; ++i) {
                    auth = (ns > 1) ? (as[i]) : (aa[i]);
#print "INFO: author "i" for "citeID" "auth | "cat 1>&2"
                    # Assume that each entry is a person.
                    if("true" == MINMETADATA) {
                        printf(" %s", auth);
                    } else if((auth == DHD1) || (auth == DHD2)) {
                        printf(DHDMD, auth);
                    } else {
                        printf(" <span itemprop=\"author creator\" itemscope itemtype=http://schema.org/Person><span itemprop=name>%s</span></span>", auth);
                        }
                    if(i < np) { printf(" and "); } # robust
                    }
                if(n > MAXAUTHORS) { printf(" et al."); }
                }
            }

        # The title and url values were collected at the top of this rule.
        urldate = attributes["urldate"]; delete attributes["urldate"];
        if("" != title) {
            printf(" ");
            if("" != url) { printf("<a href=\"%s\" itemprop=\"url sameAs\">", url); }
            printf("<cite itemprop=\"headline name\">%s</cite>", title);
            if("" != url) { printf("</a>"); }
            if(("" != urldate) && ("true" != LITE)) { printf(" (accessed <time>%s</time>)", urldate); }
            }

        editor = attributes["editor"]; delete attributes["editor"];
        if("" != editor) { printf(", editor(s) <span itemprop=editor>%s</span>", editor); }

        booktitle = attributes["booktitle"]; delete attributes["booktitle"];
        if("" != booktitle) { printf(", in <span itemprop=isPartof>%s</span>", booktitle); }

        # Generally only expected for book as-is.
        edition = attributes["edition"]; delete attributes["edition"];
        editionItemprop="version"
        if("book" == type) { editionItemprop="bookEdition version"; }
        if("" != edition) { printf(", edition <span itemprop=\"%s\">%s</span>", editionItemprop, edition); }

        series = attributes["series"]; delete attributes["series"];
        if("" != series) { printf(", <span itemprop=isPartof>%s</span>", series); }

        publisher = attributes["publisher"]; delete attributes["publisher"];
        #if("" != publisher) { printf(", <span itemprop=publisher>%s</span>", publisher); }
        if("" != publisher) { printf(", <span itemprop=publisher itemscope itemtype=http://schema.org/Organization><span itemprop=name>%s</span></span>", publisher); }

        institution = attributes["institution"]; delete attributes["institution"];
        if("" != institution) {
            # If there is no author(s), the institution becomes the creator.
            # NOTE: author must have been extracted before this.
            institutionItemprop="sourceOrganization";
            if("" == author) {
                institutionItemprop="\"sourceOrganization creator\"";
                }
            printf(", ");
            if("true" != MINMETADATA) { printf("<span itemprop=%s itemscope itemtype=http://schema.org/Organization><span itemprop=name>", institutionItemprop); }
            printf("%s", institution);
            if("true" != MINMETADATA) { printf("</span></span>"); }
            }

        school = attributes["school"]; delete attributes["school"];
        if("" != school) {
            schoolItemprop="sourceOrganization";
            printf(", school ");
            if("true" != MINMETADATA) { printf("<span itemprop=%s itemscope itemtype=http://schema.org/Organization><span itemprop=name>", schoolItemprop); }
            printf("%s", school);
            if("true" != MINMETADATA) { printf("</span></span>"); }
            }

        # Generally expect at most one of city or address or place.
        city = attributes["city"]; delete attributes["city"];
        if("" != city) { printf(", <span itemprop=locationCreated>%s</span>", city); }
        address = attributes["address"]; delete attributes["address"];
        if("" != address) { printf(", <span itemprop=locationCreated>%s</span>", address); }
        place = attributes["place"]; delete attributes["place"];
        if("" != place) { printf(", <span itemprop=locationCreated>%s</span>", place); }

        # Month may be text or numeric.
        # If 2-digit numeric (eg 03) assume month and day good for ISO 8601.
        # Accepts 3-letter lower-case English abbreviations also.
        day = attributes["day"]; delete attributes["day"];
        month = attributes["month"]; delete attributes["month"];
        year = attributes["year"]; delete attributes["year"];
        if("jan" == month) { month = "01"; }
        if("feb" == month) { month = "02"; }
        if("mar" == month) { month = "03"; }
        if("apr" == month) { month = "04"; }
        if("may" == month) { month = "05"; }
        if("jun" == month) { month = "06"; }
        if("jul" == month) { month = "07"; }
        if("aug" == month) { month = "08"; }
        if("sep" == month) { month = "09"; }
        if("oct" == month) { month = "10"; }
        if("nov" == month) { month = "11"; }
        if("dec" == month) { month = "12"; }
        if("" != year) {
            # Build up YYYY, YYYY-MM or YYYY-MM-DD as far as the fields allow.
            isodate = year;
            if(month ~ /^[01][0-9]$/) {
                isodate = isodate "-" month;
                if(day ~ /^[0-3][0-9]$/) { isodate = isodate "-" day; }
            } else if("" != month) {
                errstatus = "month ("month") format should be MM [01][0-9] for " citeID;
            }
            printf(", ");
            if("true" != MINMETADATA) { printf("<time itemprop=datePublished>"); }
            printf("%s", isodate);
            if("true" != MINMETADATA) { printf("</time>"); }
            }

        # The journal value is collected earlier.
        #journal = attributes["journal"]; delete attributes["journal"];
        if("" != journal) { printf(", <cite itemprop=publisherImprint>%s</cite>", journal); }

        volume = attributes["volume"]; delete attributes["volume"];
        if("" != volume) { printf(", volume <span itemprop=position>%s</span>", volume); }
        # Ideally would use volumeNumber.

        nidType="identifier";
        if("Report" == CWType) { nidType="reportNumber"; }
        number = attributes["number"]; delete attributes["number"];
        if("" != number) { printf(", report/number <span itemprop=%s>%s</span>", nidType, number); }

        issue = attributes["issue"]; delete attributes["issue"];
        if("" != issue) { printf(", issue <span itemprop=identifier>%s</span>", issue); }

        iidType="identifier";
        if("book" == type) { iidType="isbn"; }
        isbn = attributes["isbn"]; delete attributes["isbn"];
        if("" != isbn) { printf(", ISBN <span itemprop=%s>%s</span>", iidType, isbn); }
        eisbn = attributes["eisbn"]; delete attributes["eisbn"];
        if("" != eisbn) { printf(", eISBN <span itemprop=%s>%s</span>", iidType, eisbn); }

        issn = attributes["issn"]; delete attributes["issn"];
        if("" != issn) { printf(", ISSN <a href=\"https://portal.issn.org/resource/ISSN/%s\"><span itemprop=identifier>%s</span></a>", issn, issn); }

        eissn = attributes["eissn"]; delete attributes["eissn"];
        if("" != eissn) { printf(", eISSN <span itemprop=identifier>%s</span>", eissn); }

        lccn = attributes["lccn"]; delete attributes["lccn"];
        if("" != lccn) { printf(", LCCN <span itemprop=identifier>%s</span>", lccn); }

        pubmedid = attributes["pubmedid"]; delete attributes["pubmedid"];
        if("" != pubmedid) { printf(", PubMedID <a href=\"https://pubmed.ncbi.nlm.nih.gov/%s/\"><span itemprop=\"identifier sameAs\">%s</span></a>", pubmedid, pubmedid); }

        doi = attributes["doi"]; delete attributes["doi"];
        if("" != doi) {
            # Strip any leading https://doi.org/ (or http) portion and complain.
            if(0 != match(doi, /^https?:/)) {
                print "INFO: *** DOI supplied as URL for *** "citeID | "cat 1>&2"
                sub(/^https?:\/\/[.A-Za-z0-9-]*\//, "", doi);
                }
            # Wikipedia: The official DOI Handbook explicitly states that DOIs should display on screens and in print in the format doi:10.1000/182.
            printf(", doi:<a href=\"https://doi.org/%s\"><span itemprop=\"identifier sameAs\">%s</span></a>", doi, doi);
            }

        pages = attributes["pages"]; delete attributes["pages"];
        # There is a lot of noise in the "pages" field.
        if(("" != pages) && ("true" != LITE)) {
            if("article" == type) {
                printf(", article/pages <span itemprop=pagination>%s</span>", pages);
                } else {
                printf(", pages %s", pages);
                }
            }

        # Copyright notice.
        copyright = attributes["copyright"]; delete attributes["copyright"];
        if(("" != copyright) && ("true" != LITE)) {
            printf(", copyright ");
            # Check for a mapping to a licence URL.
            # If so, can wrap in a license URL link, else warn.
            license = "";
            for(c2li in C2Lc) {
                if(copyright == C2Lc[c2li]) {
                    license = C2Ll[c2li];
                    break;
                    }
                }
            if("" != license) {
                printf("<a href=\"%s\" itemprop=license>", license);
                } else {
                # The number shown is the count of known notices in the lookup.
                print "INFO: *** freestyle copyright text "length(C2Lc)" >"copyright"< *** "citeID | "cat 1>&2"
                }
            printf("<span itemprop=copyrightNotice>%s</span>", copyright);
            if("" != license) {
                printf("</a>");
                }
            }

        language = attributes["language"]; delete attributes["language"];
        if(("" != language) && ("true" != LITE)) { printf(", language <span itemprop=inLanguage>%s</span>", language); }

        # Link to alternative copies, eg PDF, if any, or a text note.
        # DHD20231016: was misusing eprint for this.
        howpublished = attributes["howpublished"]; delete attributes["howpublished"];
        if(("" != howpublished) && ("true" != LITE)) {
            # Treat as URL if wrapped as \url{...}.
            # The braces have already been removed from the stored value,
            # so the URL starts straight after the four characters of \url.
            if(howpublished ~ /^\\url/) {
                hpurl=substr(howpublished, 5);
                if(hpurl != url) {
                    hploc="also at";
                    if(0 != match(howpublished, /pdf$/)) { hploc = "PDF"; }
                    printf(", <span itemprop=encoding itemscope itemtype=http://schema.org/MediaObject><a href=\"%s\" itemprop=\"url contentUrl\">%s</a></span>", hpurl, hploc);
                }
            } else {
                printf(", published %s", howpublished);
                }
            }

        crossref = attributes["crossref"]; delete attributes["crossref"];
        if("" != crossref) {
            printf(", see also [%s]", crossref);
            }

        # Note the format (book, article, etc).
        if("true" != LITE) {
            printf(" (<span itemprop=learningResourceType>%s</span>)", type);
        }

        # Note the status, if any.
        if(("" != workStatus) && ("true" != MINMETADATA)) {
            printf("<meta itemprop=creativeWorkStatus content=%s>", workStatus);
            }

        # Link to BibTeX single-entry source file.
        # (This link should work for the m. version also.)
        # TODO: should this be nested in a "subjectOf" or similar?
        printf(" (");
        if("true" != MINMETADATA) { printf("<span itemprop=encoding itemscope itemtype=http://schema.org/DataDownload>"); }
        printf("<a href=//WWW.earth.org.uk/db.bibliography/single/%s.bib", citeID);
        if("true" != MINMETADATA) { printf(" itemprop=contentUrl"); }
        printf(">BibTeX</a>");
        if("true" != MINMETADATA) { printf("<meta itemprop=encodingFormat content=text/x-bibtex></span>"); }
        printf(")");

        # End the sentence but allow items such as abstract after.
        printf(".");

        # Keywords
        # Render smaller as may be long.
        keywords = attributes["keywords"]; delete attributes["keywords"];
        if(("" != keywords) && ("true" != LITE)) {
            #printf(", keywords [<span style=font-size:smaller itemprop=keywords>%s</span>]", keywords);
            printf(" <details style=font-size:smaller> <summary>keywords</summary> <q itemprop=keywords>%s</q></details>", keywords);
            }

        # Abstract.
        # Render smaller and as summary/details as may be long.
        # Omit unless in "fullfat" mode.
        # Retain initial part of abstract as description for dataset.
        abstract = attributes["abstract"]; delete attributes["abstract"];
        if("" != abstract) {
            if("false" == LITE) {
                printf(" <details style=font-size:smaller> <summary>abstract</summary> <q itemprop=\"abstract description\">%s</q></details>", abstract);
            } else if(("dataset" == type) && ("true" != LITE)) {
                # NOTE(review): the launcher only ever sets LITE to true or
                # false, so this branch cannot currently run; it may have been
                # disabled on purpose. If re-enabled, note that for(i in ...)
                # does not guarantee word order. Confirm intent.
                synthdescription=""
                split(abstract, abstractwords);
                for(i in abstractwords) {
                    synthdescription = synthdescription abstractwords[i] " ";
                    if(length(synthdescription) >= MAXTEXT) {
                        synthdescription = synthdescription "...";
                        break;
                        }
                    }
                # Strip any trailing space...
                sub(/ *$/, "", synthdescription);
                printf(" <details style=font-size:smaller> <summary>abstract</summary> <q itemprop=\"abstract description\">%s</q></details>", synthdescription);
                }
            }
        if((length(abstract) < 50) && ("dataset" == type)) {
            # DHD20241023: Google wants a description of 50--5000 chars.
            errstatus = "dataset must have an abstract/description of at least 50 chars (currently "length(abstract)") for " citeID;
            }

        # Annotation, eg key quote(s).
        # Render smaller and as summary/details as may be long.
        # Omit unless in "fullfat" mode.
        annote = attributes["annote"]; delete attributes["annote"];
        if(("" != annote)  && ("false" == LITE)) {
            #printf(", note <span style=font-size:smaller itemprop=comment itemscope itemtype=http://schema.org/Comment>[<span itemprop=text>%s</span>]</span>", annote);
            open=""
            # If not in lite mode, have notes open and searchable by default.
            if("false" == LITE) { open=" open"; }
            printf(" <details style=font-size:smaller%s> <summary>note</summary> <span itemprop=comment itemscope itemtype=http://schema.org/Comment>[<span itemprop=text>%s</span>]</span></details>", open, annote);
            }

        # End the line.
        print "";

        # Note about missing URL info, which is especially useful.
        # Forgive older items, and maybe books.
        if(("" == url) && (year>=2010)) {
            print "INFO: *** no URL for *** "citeID" year "year | "cat 1>&2"
            }

        # Warn of any unconsumed attributes.
        for(key in attributes) {
            errstatus = "unused attribute "key" in record closing on line "NR;
            break;
            }
    }

    # Clear out the key indicators before the next record starts.
    type = "";
    citeID = "";

    # No further processing of this line.
    next;
}

# Generic case to pick up fields of form "field = X,".
# Note: can be attribute = {X} or = "X" or plain X if X is a number.
# DHD20231004: insist that each field line end with , to keep biber happy.
#
# The value is cleaned up in this order:
#   1. the wrapping {} or "" and surrounding whitespace are removed;
#   2. LaTeX \& and \% are turned back into raw characters;
#   3. & and < are escaped for HTML;
#   4. dashes, ties and known LaTeX accents become HTML entities;
#   5. any remaining { } are dropped.
# The result is stored in attributes[] under the lower-cased field name.
/^ *([a-zA-Z]+) *= */ {
    match($0, /^ *([a-zA-Z]+) *= */, a);

    # DHD20231004: error if no trailing , is present.
    # (The extraction below also needs the comma, so such a line ends up
    # reported as unparseable and is not stored.)
    if($0 !~ /, *$/) {
        errstatus = "missing trailing , for field in "citeID" on line "NR": "$0;
        #next;
        }
    # Extract a {} or "" wrapped value.
    # b[1] is the opening delimiter (if any), b[2] everything up to the
    # final comma, which still includes the closing delimiter.
    match($0, /^[^=]* *= *(["{]?)(.*), *$/, b);

    if(("" == a[1]) || ("" == b[2])) {
        errstatus = "unparseable field in "citeID" line "NR": "$0;
        next;
        }

    value = b[2];

    # Drop the closing delimiter that pairs with the opening one.
    # Without this a "..." wrapped value keeps its trailing quote.
    if("\"" == b[1]) { sub(/" *$/, "", value); }
    else if("{" == b[1]) { sub(/} *$/, "", value); }

    # Strip leading/trailing whitespace.
    gsub(/^[ \t]*/, "", value);
    gsub(/[ \t]*$/, "", value);

    # Early convert a few LaTeX character escapes back to raw.
    gsub(/\\&/, "\\&", value); # \& in LaTeX for &.
    gsub(/\\%/, "%", value); # % is a LaTeX comment character.
    #gsub(/\\[$]/, "$", value); # $ introduces maths.

    # Make value HTML5-safe.
    # & to &amp;
    # < to &lt;
    gsub(/&/, "\\&amp;", value);
    gsub(/</, "\\&lt;", value);

    # Long dashes.
    gsub(/---/, "\\&mdash;", value);
    gsub(/--/, "\\&ndash;", value);

    # Tilde accents have to be handled before the bare ~ below is rewritten,
    # else the ~ inside them is replaced first and they never match.
    gsub(/{\\~a}/, "\\&atilde;", value);
    gsub(/{\\~n}/, "\\&ntilde;", value);

    # Word tie ~ to &nbsp; where safe.
    gsub(/[~]/, "\\&nbsp;", value);
    # $\sim$ back to ~.
    gsub(/[$]\\sim[$]/, "~", value);

    # Convert some LaTeX character escapes and accents to HTML.
    # See https://en.wikibooks.org/wiki/LaTeX/Special_Characters#Escaped_codes
    gsub(/{\\'\''a}/, "\\&aacute;", value);
    gsub(/{\\\^a}/, "\\&acirc;", value);
    gsub(/{\\"a}/, "\\&auml;", value);
    gsub(/{\\c{C}}/, "\\&Ccedil;", value);
    gsub(/{\\c{c}}/, "\\&ccedil;", value);
    gsub(/{\\degree}/, "\\&deg;", value);
    gsub(/{\\'\''E}/, "\\&Eacute;", value);
    gsub(/{\\'\''e}/, "\\&eacute;", value);
    gsub(/{\\`e}/, "\\&egrave;", value);
    gsub(/{\\"e}/, "\\&euml;", value);
    gsub(/{\\u{g}}/, "\\&gbreve;", value);
    gsub(/{\\[.]I}/, "\\&Idot;", value);
    gsub(/{\\'\''i}/, "\\&iacute;", value);
    gsub(/{\\\^i}/, "\\&icirc;", value);
    gsub(/{\\i}/, "\\&inodot;", value);
    gsub(/{\\l}/, "\\&lstrok;", value);
    gsub(/{\\micro}/, "\\&micro;", value);
    gsub(/{\\'\''o}/, "\\&oacute;", value);
    gsub(/{\\`o}/, "\\&ograve;", value);
    gsub(/{\\"o}/, "\\&ouml;", value);
    gsub(/{\\o}/, "\\&oslash;", value);
    gsub(/{\\pi}/, "\\&pi;", value);
    gsub(/{\\'\''s}/, "\\&sacute;", value);
    gsub(/{\\c{s}}/, "\\&scedil;", value);
    gsub(/{\\'\''u}/, "\\&uacute;", value);
    gsub(/{\\"U}/, "\\&Uuml;", value);
    gsub(/{\\"u}/, "\\&uuml;", value);
    gsub(/{\\pound}/, "\\&pound;", value);
    gsub(/\\%/, "%", value);
    # (A second \& pass used to follow here; every \& is already handled
    # above, and its unescaped & replacement could only corrupt output.)

    # Remove Bibtex-specific { } to mark initial etc.
    gsub(/[{}]/, "", value);

    attributes[tolower(a[1])] = value;

    # No further processing of this line.
    next;
}

# Catch-all: any line that none of the earlier rules consumed is an error.
{
    errstatus = "unexpected content on line " NR ": " $0;
    next; # Nothing else to do with this line.
}

# Wrap up: close the list, show the entry count when there are several,
# then report the recorded error (if any) and fail with status 1.
END {
    if("true" != TERSE) {
        print "</ul>";
        if(count > 1) {
            print "<p>(Entries: "count")</p>";
            }
        }

    # errstatus holds whichever error was recorded last.
    if("" != errstatus) {
        print "ERROR: "errstatus | "cat 1>&2"
        exit 1
        }
}

'



# Guard: the exec above replaces this shell, so control should never
# arrive here; complain loudly and fail if it somehow does.
printf '%s\n' "ERROR: should not get here!" >&2
exit 2


# Example input records.

@misc{hart-davis202216ww,
  title={16WW Eddi PV DHW Diverter Export Margin Analysis (2022-08)},
  author={Hart-Davis, Damon},
  url={https://www.earth.org.uk/eddi-diverter-export-margin-analysis.html},
  number={EOU-eddi-diverter-export-margin-analysis},
  year={2022},
}

@article{march1989cloning,
  title={Cloning and characterization of an Escherichia coli gene, pcnB, affecting plasmid copy number},
  author={March, JB and Colloms, MD and Hart-Davis, D and Oliver, IR and Masters, M},
  journal={Molecular microbiology},
  volume={3},
  number={7},
  pages={903--910},
  year={1989},
  publisher={Wiley Online Library},
}

exit 3


### Additional info:

Wikipedia: The official DOI Handbook explicitly states that DOIs should display on screens and in print in the format doi:10.1000/182 .

# C2L copyright notice -> licence URL lookup file format:
# Mapping from copyright BibTeX field to schema.org/license URL.
"Creative Commons Attribution 4.0 International","https://creativecommons.org/licenses/by/4.0/"
"Creative Commons Attribution-NonCommercial 4.0","https://creativecommons.org/licenses/by-nc/4.0/"
