#!/bin/sh
# Given an article on stdin (or at least its header portion)
# writes the article title (as in the <h1> on the first line) to stdout,
# else an empty result if the title is not present or not well-formed.
#
# Any non-empty result will not contain any '<' (nor '>')
# and so no sub-tags nor premature tag endings,
# and should be safe as element content for [X]HTML.

# If -noent is given then each named entity &xyz; is rewritten to its
# first letter x, eg '&eacute;' to 'e', and all other entities are removed.
# article_title [-noent|-noentities]
#
# Reads an article from stdin and inspects only its first line.  If that
# line is exactly one <h1>...</h1> element (optionally surrounded by
# spaces) whose content starts with a digit or an upper-case ASCII letter
# and contains no '<' or '>', the content is written to stdout; otherwise
# nothing is written.
#
# With -noent (or -noentities, the spelling used in the file header),
# entities are flattened before the title is matched: a named entity
# &Xyz; is reduced to its first letter X, and every remaining &...;
# sequence (eg numeric entities) is removed.
#
# The exit status is that of the final sed in the pipeline.
article_title() {
  # Collect the optional sed rules as positional parameters so that each
  # one stays a single, unglobbed argument.  An unquoted string variable
  # would be subject to pathname expansion because the rules contain
  # '[', ']' and '*'.
  case "${1-}" in
    -noent | -noentities)
      set -- -e 's/&\([a-zA-Z]\)[^;]*;/\1/g' -e 's/&[^;]*;//g'
      ;;
    *)
      set --
      ;;
  esac

  # "$@" expands to zero words when no extra rules were selected.
  head -n 1 |
    sed -n "$@" -e 's/^ *<[Hh]1>\([0-9A-Z][^<>]*\)<\/[Hh]1> *$/\1/p'
}

article_title "$@"
