#!/bin/sh
# Merge (eg energy) data from across devices into one CSV table.
# Assumes that there is exactly one (dense: no missing dates) stream per input.
# The input file names will be parsed for variable-granularity-device.
# The inputs must be consistent on variable and granularity.
# The inputs may be in different directories.
# The output will be canonicalised to be independent of argument order.
# This is effectively a join on the datetime (first) field.
# All inputs must be well-formed, eg in date order.
# In the trivial case with one input file it is copied to stdout as-as.
#
# This can also be used to merge related data from different sources
# for comparison, eg h1d and h1b (Eddi divert and boost).
# The variable-match check would have to be disabled,
# and new source names may be needed for each entry, eg including the variable.
#
# Usage:
#     $0 [-het] input1.csv ... inputn.csv
#
#     -het  heterogeneous variables to be combined
#
# Exits with non-zero error code in case of error.
#
# See end of this file for more detail.

HETVARS="false"
if [ "-het" = "$1" ]; then
    HETVARS="true"
    shift
fi

if [ $# -lt 1 ]; then
    echo "ERROR: no input files provided" 1>&2
    exit 1
fi

# In the trivial case with one input file it is copied to stdout as-as.
if [ $# -eq 1 ]; then
    cat $1
    exit 0
fi

# Full set of inputs.
INPUTS="$*"

# Canonical (alpha) sorted list of devices/sources.
DEVICES=""

# Parse variable and granularity from first one.
VARIABLE="$(basename $2 .csv | awk -F- '{print $1}')"
GRANULARITY="$(basename $2 .csv | awk -F- '{print $2}')"
# Ensure that these match across all inputs.
for f in $INPUTS;
    do
    V="$(basename $f .csv | awk -F- '{print $1}')"
    G="$(basename $f .csv | awk -F- '{print $2}')"
    D="$(basename $f .csv | awk -F- '{print $3}')"
    if [ "$G" != "$GRANULARITY" ]; then
        echo "ERROR: mixed granularity values" 1>&2
        exit 1
    fi
    if [ "true" != "$HETVARS" -a "$V" != "$VARIABLE" ]; then
        echo "ERROR: mixed variable values" 1>&2
        exit 1
    fi
    if [ "true" != "$HETVARS" ]; then
        # Default case.
        DEVICES="$DEVICES $D"
    else
        # Qualify device name with variable with -het
        DEVICES="$DEVICES $D-$V"
    fi
    done

# Sort device names.
# This will also drop any duplicates.
DEVICESSORTED="$(echo $DEVICES | awk '{for(i=1;i<=NF;++i){print $i}}' | sort -uf)"

# Build in-order list of files.
INPUTSSORTED=""
for d in $DEVICESSORTED;
    do
    for f in $INPUTS
        do
        if [ "true" != "$HETVARS" ]; then
            # Default case.
            D="$(basename $f .csv | awk -F- '{print $3}')"
            if [ "$D" = "$d" ]; then INPUTSSORTED="$INPUTSSORTED $f"; fi
        else
            # Qualify device name with variable with -het
            V="$(basename $f .csv | awk -F- '{print $1}')"
            D="$(basename $f .csv | awk -F- '{print $3}')"
            if [ "$D-$V" = "$d" ]; then INPUTSSORTED="$INPUTSSORTED $f"; fi
        fi
        done
    done

# Make the header...
# Steal the date comment field from the first input for the correct format!
HEADER="$(awk -F, < $1 '{printf("%s",$1);exit}')"
for d in $DEVICESSORTED;
do
    HEADER="$HEADER,device,coverage,$VARIABLE"
done

# Create the header...
echo "$HEADER"
# List the input files in sorted/canonical order.
for f in $INPUTSSORTED;
do
    echo '#input,"'"$f"'"'
done


# O(n) merge.
# Pipe canonical ordered list of date keys into awk to be the anchor column.
cat $INPUTS | awk -F, '/^2/ {print $1}' | sort -u | \
    awk -v INPUTSSORTED="$INPUTSSORTED" '
        BEGIN {
        nFiles = split(INPUTSSORTED, filenames, " ");
        }
        {
        date=$1;
        printf("%s", date);
        for(i = 1; i <= nFiles; ++i) {
            printf(",");

            # Deal with any saved datum for this file.
            if("" != savedDate[i]) {
                if(savedDate[i] < date) {
                    # Too old now, ignore.
                    savedDate[i] = ""; # Purge saved.
                } else if(savedDate[i] > date) {
                    # Too soon, so avoid reading more from this file yet.
                    printf(",,"); continue;
                    }
                # Matches date!
                # Can use it and need not read from file.
                split(savedLine[i], cols, ",");
                printf("%s,%s,%s", cols[2], cols[3], cols[4]);
                savedDate[i] = ""; # Purge saved.
                continue;
                }
            
            # Read the next row from this file.
            # Usually expect to output date-matching row each time (final case).
            while(1) {
                if((getline line < filenames[i]) <= 0) {
                    # At EOF.
                    printf(",,");
                    break;
                    }

                # Skip comment line.
                if("#" == substr(line,1,1)) { continue; }
                split(line, cols, ",");
                rowDate = cols[1];
                # Skip over file row date older than key.
                if(rowDate < date) { continue; }

                # Save future data for later.
                if(rowDate > date) {
                    savedDate[i] = rowDate;
                    savedLine[i] = line;
                    printf(",,");
                    break;
                    }

                # When we have a date-matching row, add it to the output.
                printf("%s,%s,%s", cols[2], cols[3], cols[4]);
                break;
                }
            }
        print ""; # Terminate row/record.
        }
        END {
        # Close all the files, though not strictly necessary!
        for(i = 1; i <= nFiles; ++i) { close(filenames[i]); }
        }
        '

exit 0



# Merge data such as:

##YYYY-MM,device,coverage,imp,comment
#2009-10,meter,1,120.2
#2009-11,meter,1,135.7
# ...
#2022-01,meter,1,160.0
#2022-02,meter,1,74.3
#2022-03,meter,1,56.4
#2022-04,meter,1,18.5

# and

##YYYY-MM,device,coverage,imp,comment
# ...
#2022-01,Enphase,1,158.089
#2022-02,Enphase,1,72.18
#2022-03,Enphase,0.998656,52.66
#2022-04,Enphase,1,14.347
# ...

# and
#YYYY-MM,device,coverage,imp,comment
#2022-02,Eddi,0.21131,11.871
#2022-03,Eddi,1,78.915
#2022-04,Eddi,1,35.058

# into:

#YYYY-MM,device,coverage,imp,device,coverage,imp,device,coverage,imp
#2022-02,Eddi,0.21131,11.871,Enphase,1,72.18,meter,1,74.3
#2022-03,Eddi,1,78.915,Enphase,0.998656,52.66,meter,1,56.4
#2022-04,Eddi,1,35.058,Enphase,1,14.347,meter,1,18.5
