#!/bin/sh
# Extract keyword tags, if any, from raw input HTML file on stdin.
#
# Parameters:
#
# [none]
#     Returns canonicalised (lowercased, sorted, de-duplicated) keywords
#     if any, else "".  The canonicalised tags are space separated.
#
# -raw
#     Returns raw tags as extracted from the comment/directive if any, else ""
#
# -major
#     Returns canonical tags filtered to only the 'major' keywords,
#     which are broadly those for which styling is provided.
#     These correspond to 'sections' from a magazine, for example.
#
# -styled
#     Returns (canonicalised) tags as semantic HTML with CSS styling
#
# -contains keyword (keyword*)
#     Returns first listed keyword found if any, else ""
#     Matching is case-insensitive.
#
#
# Tags should be in HTML comments on their own at the top of the document like:
#
#     <!-- TAGS EASYREAD -->
#
# or:
#
#     <!-- TAGS EASYREAD TECH OTHER -->
#
# The tags comment should not have anything (even whitespace) before or after
# it on its line, and there should be at most one such directive/line.
#
# Order of tags is unimportant.
#
# Matching is case-insensitive, but directives should be uppercase.

# Pull the tag list out of the "<!-- TAGS ... -->" directive read from stdin.
# Only a line consisting solely of the directive matches; the captured text is
# the run of letters and spaces after "TAGS ", in its original case.
# TAGS is left empty when no line matches.
TAGS=$(sed -n 's/^<!-- *TAGS \([A-Za-z ]*\) *-->$/\1/p')

# -raw: print the tags exactly as captured from the directive (possibly
# nothing but a newline) and stop before any canonicalisation happens.
case "$1" in
    -raw)
        echo "$TAGS"
        exit 0
        ;;
esac

# Canonicalise (make lower-case, sort, de-duplicate).

# sort_unique_words [WORD...]
#     Prints the distinct WORDs in ascending byte order, space separated,
#     on one line.  With no WORDs the output is empty or a blank line,
#     either of which command substitution reduces to "".
#     LC_ALL=C pins the collation so that the canonical order does not
#     vary with the locale of whoever runs the script.
sort_unique_words() {
    printf '%s\n' "$@" | LC_ALL=C sort -u | xargs
}

# $TAGS and $LCTAGS are deliberately unquoted so that they are split into
# words; the extraction above only admits letters and spaces, so there is
# nothing in them for the shell to expand as a filename pattern.
LCTAGS="$(echo $TAGS | tr '[A-Z]' '[a-z]')"
CANONTAGS="$(sort_unique_words $LCTAGS)"

# Return only 'major' tags.
# A tag is 'major' if the TAGCSS stylesheet has a line starting '.tag-<tag>{'.
# It is an error (message on stderr, exit status 1) if the TAGCSS file is
# missing or empty.

# major_tags_in_css CSSFILE [TAG...]
#     Prints, space separated on one line and in the order given, those TAGs
#     for which CSSFILE contains a line starting '.tag-TAG{'.
#     Prints nothing at all if no TAG qualifies.
major_tags_in_css() {
    _mt_css="$1"
    shift
    _mt_found=""
    for _mt_tag in "$@"; do
        # Plain grep with the brace in a bracket expression: egrep is
        # obsolescent (recent GNU grep warns on every call) and a bare '{'
        # in an extended regular expression is undefined by POSIX.
        if grep -q "^[.]tag-${_mt_tag}[{]" "$_mt_css"; then
            _mt_found="${_mt_found:+$_mt_found }$_mt_tag"
        fi
    done
    if [ -n "$_mt_found" ]; then echo "$_mt_found"; fi
}

if [ "-major" = "$1" ]; then
    TAGCSS=img/css/uncompressed/tag.css
    if [ ! -s "$TAGCSS" ]; then
        echo "ERROR: $0: cannot find $TAGCSS to support -major" 1>&2
        exit 1
    fi
    # CANONTAGS is unquoted on purpose: it is a space-separated word list.
    major_tags_in_css "$TAGCSS" $CANONTAGS
    exit 0
fi

# -styled: emit the canonical tags as CSS-styled HTML.
# Nothing is printed when there are no tags.  Otherwise the output is
#     <i>(SPAN SPAN ...)</i>
# with no trailing newline, where each SPAN is, for tag %s:
#     <span class=tag-%s>%s</span>
# and SPANs are separated by single spaces.
if [ "-styled" = "$1" ]; then
    if [ -n "$CANONTAGS" ]; then
        printf '<i>('
        sep=''
        for tag in $CANONTAGS; do
            printf '%s<span class=tag-%s>%s</span>' "$sep" "$tag" "$tag"
            sep=' '
        done
        printf ')</i>'
    fi
    exit 0
fi

# Attempt to match against one of the supplied keywords.
# Print/return first match, canonicalised (lower-cased).
# If no match then return nothing ("").

# first_matching_tag CANONTAGS [KEYWORD...]
#     CANONTAGS is one space-separated string of canonical (lower-case) tags.
#     Prints the first KEYWORD, lower-cased, that equals one of those tags;
#     prints nothing if none does.  The status is always 0.
#     The body is a subshell so that 'set -f' and the loop variables do not
#     leak into the rest of the script.
first_matching_tag() (
    canon="$1"
    shift
    # Keywords are still split on whitespace (so a single argument may carry
    # several keywords) but must never be expanded as filename patterns:
    # otherwise a keyword such as '*' could turn into the name of a file in
    # the current directory and produce a false match.
    set -f
    for k in $*; do
        lck="$(printf '%s\n' "$k" | tr '[A-Z]' '[a-z]')"
        for j in $canon; do
            if [ "$lck" = "$j" ]; then
                printf '%s\n' "$lck"
                return 0
            fi
        done
    done
    return 0
)

if [ "-contains" = "$1" ]; then
    shift
    first_matching_tag "$CANONTAGS" "$@"
    exit 0
fi


# Reached when no option above has handled the request and exited:
# print the canonical tags on one line (a blank line if there are none).
set -- $CANONTAGS
echo "$*"

exit 0
