#!/bin/sh
# Prepare a set of RSS stats files under OUTPUTDIR based on Web server logs.
# By default uses logs for the last full ~week, for main EOU site.
# Filtering is applied early in the data pipeline to partly anonymise.
#
# Usage: $0 [-i INPUTFILE]

# Log input is of the form:
#www.earth.org.uk:80 1.2.3.4 - - [07/Apr/2024:16:45:21 +0000] "GET /rss/podcast.rss HTTP/1.1" 200 11671 "-" "Amazon Music Podcast"

# Input file (default to last full ~week, uncompressed).
INPUTFILE=/var/log/apache2/other_vhosts_access.log.1
# Optional override: -i INPUTFILE
if [ "-i" = "$1" ]; then
    # Fail fast with a clear message rather than proceeding with an
    # empty INPUTFILE and a confusing "missing input file " error.
    if [ -z "$2" ]; then
        echo "ERROR: -i requires an INPUTFILE argument" 1>&2
        exit 2
    fi
    INPUTFILE="$2"
    shift 2
fi
# Refuse to run on a missing or empty log: every stat would be empty.
if [ ! -s "$INPUTFILE" ]; then
    echo "ERROR: missing input file $INPUTFILE" 1>&2
    exit 1
fi

# Output directory (no trailing slash).
OUTPUTDIR=/tmp/stats.out
# mkdir -p: succeeds if the directory already exists (no test-then-create
# race); abort if it cannot be created since every output lands there.
mkdir -p "$OUTPUTDIR" || { echo "ERROR: cannot create $OUTPUTDIR" 1>&2; exit 1; }

# IP obfuscator script: masks the last IP octet to partly anonymise logs.
IPOBFUSC="hideLastIPOctet.sh"

# Site filter regex (default to main EOU site).
# Dots are escaped so that only the literal hostname matches, not
# arbitrary characters in those positions.
FILTERSITE='^www\.earth\.org\.uk:'

# RSS podcast filter regex: matches requests for /rss/podcast.rss
# and /rss/podcast-lite.rss (surrounded by spaces in the request line).
FILTERRSS=' \/rss\/podcast(-lite)?\.rss '

# Extract first and last entry timestamps (log field 5, with the leading
# '[' stripped), all assumed UTC.
FIRSTTS=$(head -1 "$INPUTFILE" | awk '{print substr($5, 2)}' | sh logDateToISO8600.sh)
LASTTS=$(tail -1 "$INPUTFILE" | awk '{print substr($5, 2)}' | sh logDateToISO8600.sh)

# Record the covered interval, and derive its length in days from it.
INTERVALTEXT="$FIRSTTS to $LASTTS inclusive log data"
echo "INFO: $OUTPUTDIR/interval.txt: $INTERVALTEXT"
echo "$INTERVALTEXT" > "$OUTPUTDIR/interval.txt"
sh intervalDays.sh < "$OUTPUTDIR/interval.txt" > "$OUTPUTDIR/intervalDays.txt"
echo "INFO: $OUTPUTDIR/intervalDays.txt: $(cat "$OUTPUTDIR/intervalDays.txt")"

# Hit counts: all lines, site-only lines, and site RSS feed lines.
# Counting inside awk avoids an extra wc process and the leading
# whitespace padding that some wc implementations emit.
HITSALL="$(awk 'END{print NR}' < "${INPUTFILE}")"
HITSSITE="$(awk "/${FILTERSITE}/{n++} END{print n+0}" < "${INPUTFILE}")"
HITSRSS="$(awk "/${FILTERSITE}/ && /${FILTERRSS}/{n++} END{print n+0}" < "${INPUTFILE}")"
echo "INFO: hits: all ${HITSALL}, site ${HITSSITE}, feed ${HITSRSS}"
(echo "all: ${HITSALL}"; echo "site: ${HITSSITE}"; echo "feed: ${HITSRSS}";) > "$OUTPUTDIR"/hits.txt

# Byte totals from the response-size field ($11).
# Filter and sum in a single awk pass; "+0" forces a numeric 0 (rather
# than an empty string) when no lines match the filter.
BYTESALL="$(awk '{sum+=$11} END{print sum+0}' < "${INPUTFILE}")"
BYTESSITE="$(awk "/${FILTERSITE}/{sum+=\$11} END{print sum+0}" < "${INPUTFILE}")"
BYTESRSS="$(awk "/${FILTERSITE}/ && /${FILTERRSS}/{sum+=\$11} END{print sum+0}" < "${INPUTFILE}")"
echo "INFO: bytes: all ${BYTESALL}, site ${BYTESSITE}, feed ${BYTESRSS}"
(echo "all: ${BYTESALL}"; echo "site: ${BYTESSITE}"; echo "feed: ${BYTESRSS}";) > "$OUTPUTDIR"/bytes.txt

# report OUTBASE DESCRIPTION AWKFILTER [POSTPROC]
#   Obfuscate the log (via $IPOBFUSC), keep lines matching the awk
#   pattern expression AWKFILTER ('1' keeps everything), optionally pipe
#   through helper script POSTPROC, write to $OUTPUTDIR/OUTBASE.log and
#   show the first few lines.  Factored out from seven near-identical
#   pipelines; now also honours the IPOBFUSC constant instead of
#   hard-coding the obfuscator script name.
report() {
    _rout="$OUTPUTDIR/$1.log"
    echo "INFO: $_rout: $2..."
    if [ -n "${4-}" ]; then
        sh "$IPOBFUSC" < "${INPUTFILE}" | awk "$3" | sh "$4" > "$_rout"
    else
        sh "$IPOBFUSC" < "${INPUTFILE}" | awk "$3" > "$_rout"
    fi
    head -5 "$_rout"
}

report allHitsByHour    "all hits by hour (UTC)"   '1' hitsByHour.sh
report siteHitsByHour   "site hits by hour (UTC)"  "/${FILTERSITE}/" hitsByHour.sh
report feedHits         "RSS feed hits"            "/${FILTERSITE}/ && /${FILTERRSS}/"
report feedHitsByUA     "feed hits by UA"          "/${FILTERSITE}/ && /${FILTERRSS}/" hitsByUA.sh
report feedHitsByHour   "feed hits by hour (UTC)"  "/${FILTERSITE}/ && /${FILTERRSS}/" hitsByHour.sh
report feedStatusByUA   "feed hits and status by UA" "/${FILTERSITE}/ && /${FILTERRSS}/" statusByUA.sh
report feedStatusByHour "feed hits and status by hour (UTC)" "/${FILTERSITE}/ && /${FILTERRSS}/" statusByHour.sh
