#!/bin/sh
# Minify from mostly-XHTML source on stdin to UTF-8 HTML5 on stdout.
#
# Uses a default set of maximally-compressing parameters.
#
# Can be run with -gentle to be safer and easier to read/debug.
# In particular, -gentle mode doesn't throw away optional tags,
# and it tries to preserve (most) line endings.
#
# On V3.5.3 as of 2017-11-26.
# On V3.5.12 as of 2018-03-23.
# Server too old to switch to html-minifier-next V1.2.1 as of 2025-08-30.
#
# Does as many legal size reductions as possible and effective
# given the site's raw and wrapper HTML.
#
# If no suitable minifier can be found, copies stdin to stdout.
# 
# Be careful minifying JS to avoid breaking ad code in particular.
#
# May wish not to sort attributes since that forces an extra space
# before a tag's trailing '>' where the now-final attribute ends with '/'.
# But when tested on m/index.htmlgz,
# the compressed file was smaller with sort enabled.
#
# --decode-entities to convert entities to Unicode makes output non-7-bit
# which may break some recipients.
# When tested on m/index.htmlgz, and another representative popular page,
# the compressed file was larger with decode enabled.
#
# --preserve-line-breaks makes output more readable, but a little larger.
#
# --collapse-boolean-attributes removes the empty Share42 divs!
#
# DHD20181227: --sort-attributes breaks some parsing of meta tags
# (eg for Twitter og:description and Speakable xpath),
# so is omitted for now.
#
# DHD20250413: as I have not cared about Twitter for years,
# trying --sort-attributes again!
#
# DHD20250830: would like to add  --no-newlines-before-tag-close
#
# Returns any error code from the minifier.

# Ensure that the NPM binaries directory (NPMBIN, eg /usr/local/bin)
# is on the PATH for 'node' et al.
# script/NPM.loc is expected to set NPMBIN; the path is relative,
# so this relies on the current directory when the script is run.
if [ -r script/NPM.loc ]; then
    . script/NPM.loc
else
    # Carry on regardless: with no minifier found stdin is copied to stdout.
    echo "$0: warning: cannot read script/NPM.loc." 1>&2
fi
# Only extend PATH when NPMBIN is non-empty: an empty value would leave
# a trailing ':' on PATH, which makes the current directory searched.
if [ -n "${NPMBIN:-}" ]; then
    PATH="$PATH:$NPMBIN"
fi
export PATH

# Where the kangax HTMLMinifier executable is expected to be installed.
# (The commented-out alternative is html-minifier-next.)
KHM="${NPMBIN}/html-minifier"
#KHM="${NPMBIN}/html-minifier-next"

# Always-safe core options.
# Accumulated a group at a time; the value is word-split into separate
# arguments when handed to the minifier, so only the option words
# and their order matter.
COREOPTS="--minify-css true --minify-js true"
COREOPTS="$COREOPTS --collapse-whitespace --conservative-collapse"
COREOPTS="$COREOPTS --collapse-boolean-attributes"
COREOPTS="$COREOPTS --no-include-auto-generated-tags"
COREOPTS="$COREOPTS --remove-attribute-quotes"
COREOPTS="$COREOPTS --remove-redundant-attributes"
COREOPTS="$COREOPTS --remove-comments --remove-empty-attributes"
COREOPTS="$COREOPTS --remove-script-type-attributes --remove-style-link-type-attributes"
COREOPTS="$COREOPTS --sort-attributes"
COREOPTS="$COREOPTS --sort-class-name"

# Stronger options: the results are to spec and should be safe
# (in particular, omitting head and body tags),
# but some parsers (eg Bing, circa 2019) object to them.
STRONGOPTS='--remove-optional-tags'

# Gentler options: better human-readability at the expense of compression.
# Preserving line-breaks seems to diminish size reduction from ~2% to ~1%.
GENTLEOPTS='--preserve-line-breaks'

# Argument handling: the only recognised (optional) first argument
# is -gentle, which selects the gentler option set.
# Any other first argument is reported on stderr and is fatal.
GENTLE=false
if [ "$#" -gt 0 ]; then
    case "$1" in
        -gentle)
            GENTLE=true
            ;;
        *)
            echo "$0: unknown argument $1; only -gentle is expected." 1>&2
            exit 1
            ;;
    esac
fi

# Compose the final option list: core options plus either the gentle
# extras (when GENTLE is exactly "true") or the strong extras (otherwise).
case "$GENTLE" in
    true) OPTS="$COREOPTS $GENTLEOPTS" ;;
    *)    OPTS="$COREOPTS $STRONGOPTS" ;;
esac

# Run the minifier as a filter (stdin to stdout) if it is installed,
# replacing this shell so that the minifier's exit status is returned.
# Otherwise fall back to copying stdin to stdout unchanged.
# KHM is quoted so that an install path containing spaces still works,
# and must be a regular file since directories can also pass the -x test.
if [ -f "$KHM" ] && [ -x "$KHM" ]; then
    # OPTS is deliberately unquoted: it is split into separate arguments.
    # shellcheck disable=SC2086
    exec "$KHM" $OPTS
else
    exec cat
fi
