#!/bin/sh
#
# Extract plausible alt/title/caption text from the file/URL name in $1.
# Echoes alt text (or nothing) to stdout.
#
# Usage:
#     $0 path-to-image-file
#
# This knows about implicit naming rules for media files;
# uses various heuristics.
#
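# This only inspects the final (non-directory) name portion.
# Nothing is echoed by the name heuristics unless the path contains
# at least one '/' and the final name has a file extension.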

# NOTE: if path-to-image-file.alt.txt exists, its contents are used.
# Those contents must be safe for an HTML alt attribute or similar.
# Sidecar override: a non-empty "<image path>.alt.txt" file supplies the text
# verbatim and bypasses all of the file-name heuristics.
ALTTXTFILE="$1.alt.txt"
if [ -s "$ALTTXTFILE" ]; then
    # Quote the path so names containing spaces or glob characters work,
    # and use '--' so a name starting with '-' is not parsed as an option.
    # exec replaces this shell, so cat's exit status becomes the script's.
    exec cat -- "$ALTTXTFILE"
fi

# Try to extract something useful for the ALT tag or similar, by default empty.
# If the image path seems well-formed, eg in an expected location:
# then treating '-' as a word separator, use:
#   * the base name of the image
#   * trim from the end any short numeric + 'w' pixel width 
#   * trim from the end any short pure-numeric (number-in-series)
#   * trim from the end any of a small number of common tokens eg 'tn' or 'sq'.
# Then replace the '-'s (hyphens) with spaces.
#
# All of the following should be normalised to "hob reflections":
#     img/SMAKLIG/hob-reflections-1.jpg
#     img/SMAKLIG/hob-reflections-1-128w.jpg
#     img/SMAKLIG/hob-reflections-1-200w.jpg
#     img/SMAKLIG/hob-reflections-1-256w.jpg
#     img/autogen/hero/hob-reflections-1.l205722.800w.jpg
#
# Note that this implies trimming all file extensions if more than one '.'.
#
# This may have to be taken in several steps because of sed regex limitations.
# Single sed pass over the path given in $1:
#   1. collapse multiple extensions to the last one (x.a.b.jpg -> x.jpg);
#   2. emit nothing unless there is a directory part and an extension,
#      otherwise keep only the extension-less base name;
#   3. trim, in this order and at most once each, a trailing
#      -<digits>w width, a trailing single -<digit>, -tn, and -sq;
#   4. replace the remaining hyphens with spaces.
# The width rule requires at least one digit so that a bare "-w" suffix
# is kept as a word rather than being mistaken for a pixel width.
# printf is used rather than echo so that backslashes in $1 are passed
# through verbatim whatever /bin/sh happens to be.
printf '%s\n' "$1" | \
    sed -e 's|[.][^/]*\([.][^./]*\)$|\1|' \
        -e '\|/[^/]*[.][^./]*$|!d' \
        -e 's|^.*/\([^/]*\)[.][^./]*$|\1|' \
        -e 's/-[0-9][0-9]*w$//' \
        -e 's/-[0-9]$//' \
        -e 's/-tn$//' \
        -e 's/-sq$//' \
        -e 's/-/ /g'

# Always report success, whether or not any alt text was produced above.
exit 0