#!/usr/bin/mawk -We
# *********************************************************************
#  Written by and copyright Carlo Strozzi <carlos@linux.it>.
#
#  jointable: joins two NoSQL tables on a common field.
#  Copyright (C) 1998-2001 Carlo Strozzi <carlos@linux.it>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
#  2001-01-03 Ported to NoSQL v3
#  2001-03-17 Automated workfile creation (sucks!)
#  2001-04-17 Added inline help
#  2001-08-17 Added stdio portability
#
#  $Id$
# *********************************************************************

# Start-up: parse the command line, make sure a work-file exists, then read
# the header of whichever table is a regular file and copy its body to the
# work-file, so that join(1) later sees data rows only.  The other table is
# expected on stdin and is handled by the main rules.
BEGIN {
  NULL = ""; FS = OFS = "\t"

  # Join columns should default to non-existent names.
  c_names[1] = c_names[2] = "---"

  # Separate join(1) options from column names and table names.

  while (ARGV[++i] != NULL) {
    # Turn long options into their short form.
    if (ARGV[i] == "--all") ARGV[i] = "-a"
    else if (ARGV[i] == "--column") ARGV[i] = "-j"
    else if (ARGV[i] == "--suppress-join-column") ARGV[i] = "-J"
    else if (ARGV[i] == "--debug") ARGV[i] = "-x"
    else if (ARGV[i] == "--tmpfile") ARGV[i] = "-t"
    else if (ARGV[i] == "--help") ARGV[i] = "-h"

    # Now process each option in turn.
    if (ARGV[i] == "-j") c_names[1] = c_names[2] = ARGV[++i]
    else if (ARGV[i] == "-J") no_jcol = 1
    else if (ARGV[i] == "-x") debug = 1
    else if (ARGV[i] == "-1") c_names[1] = ARGV[++i]
    else if (ARGV[i] == "-2") c_names[2] = ARGV[++i]
    else if (ARGV[i] == "-t") tmpfile = ARGV[++i]
    else if (ARGV[i] == "-h") {
       system("grep -v '^#' @NOSQLPATH@/nosql/help/jointable.txt")
       rc = 1
       exit(rc)
    }
    # "-a" may carry its argument attached ("-a1") or as the next word.
    else if (sub( /^-a */, NULL, ARGV[i])) {
      if (ARGV[i] == NULL) join_args = join_args " -a " ARGV[++i]
      else join_args = join_args " -a " ARGV[i]
    }
    # Anything that is not an option (or is a lone "-") is a table name.
    else if (ARGV[i] !~ /^-/ || ARGV[i] == "-") {
      j_tables = j_tables " " ARGV[i]
    }
    # Unknown options are handed to join(1) untouched.
    else join_args = join_args " " ARGV[i]
  }

  ARGC = 1					# Fix argv[]

  split(j_tables, tbl, " ")

  # Only one table may be on stdin.  Check this before any work-file is
  # created, and set rc so that the END rule keeps the exit status.
  if (tbl[1] == "-" && tbl[2] == "-") {
    print "jointable: only one table may be on stdin" > "@STDERR@"
    rc = 1
    exit(rc)
  }

  # A work-file is required: create one if "-t" was not given.
  # tmpdir is set only here, so a non-empty tmpdir also means "the
  # work-file is ours and must be removed on exit".
  if (tmpfile == NULL) {
     tmpdir = ENVIRON["TMPDIR"]
     if (tmpdir == NULL) tmpdir = "/tmp"
     mktemp = "mktemp -q " tmpdir "/joinXXXXXX"
     mktemp | getline tmpfile
     close(mktemp)
     if (tmpfile == NULL) {
     	print "jointable: mktemp(1) failed" > "@STDERR@"
     	rc = 1
     	exit(rc)
     }
  }

  # Get column names from whichever table is _not_ on stdin.
  if (tbl[1] != "-") {
    # Column names.  A negative result means the file cannot be read.
    if ((getline < tbl[1]) < 0) {
      print "jointable: cannot read table '" tbl[1] "'" > "@STDERR@"
      if (tmpdir != NULL) system("rm -f " tmpfile)
      rc = 1
      exit(rc)
    }
    hdr_1 = $0

    # Get column names and positions.
    # Make sure we pick the first occurrence of duplicated column
    # names (it may happen after a join).

    while (++p <= NF) {
      if (P1[$p] == NULL) P1[$p] = p
      N1[p] = $p
    }

    j1 = P1[c_names[1]]

    # Default to 1st column on invalid column name.
    if (j1 == NULL) {
      c_names[1] = N1[1]
      j1 = P1[c_names[1]]
    }

    getline < tbl[1]        # Dashline

    # Now write the body of tbl[1] to the work-file.  Open (and truncate)
    # it first, so that a table without data rows still yields an empty
    # work-file instead of a missing or stale one.

    printf "" > tmpfile
    while ((getline < tbl[1]) > 0) print > tmpfile
    close(tbl[1]); close(tmpfile)

    # Make sure the file is actually written before proceeding.
    fflush()

    # Set new table file name.
    tbl[1] = tmpfile
  }
  else {
    # Column names.  A negative result means the file cannot be read.
    if ((getline < tbl[2]) < 0) {
      print "jointable: cannot read table '" tbl[2] "'" > "@STDERR@"
      if (tmpdir != NULL) system("rm -f " tmpfile)
      rc = 1
      exit(rc)
    }
    hdr_2 = OFS $0 OFS

    # Get column names and positions.
    # Make sure we pick the first occurrence of duplicated column
    # names (it may happen after a join).

    while (++p <= NF) {
      if (P2[$p] == NULL) P2[$p] = p
      N2[p] = $p
    }

    j2 = P2[c_names[2]]

    # Default to 1st column on invalid column name.
    if (j2 == NULL) {
      c_names[2] = N2[1]
      j2 = P2[c_names[2]]
    }

    getline < tbl[2]        # Dashline

    # Now write the body of tbl[2] to the work-file.  Open (and truncate)
    # it first, so that a table without data rows still yields an empty
    # work-file instead of a missing or stale one.

    printf "" > tmpfile
    while ((getline < tbl[2]) > 0) print > tmpfile
    close(tbl[2]) ; close(tmpfile)

    # Set new table file name.
    tbl[2] = tmpfile
  }
}

#
# Main loop
#

# Header record of the table on stdin: collect its column names, print the
# output header and dashline, then build the join(1) command line that the
# data rows are piped into by the following rule.
NR == 1 {
  p = 0
  if (tbl[1] == "-") {
    hdr_1 = $0

    # Get column names and positions.
    # Make sure we pick the first occurrence of duplicated column
    # names (it may happen after a join).

    while (++p <= NF) {
      if (P1[$p] == NULL) P1[$p] = p
      N1[p] = $p
    }

    j1 = P1[c_names[1]]

    # Default to 1st column on invalid column name.
    if (j1 == NULL) {
      c_names[1] = N1[1]
      j1 = P1[c_names[1]]
    }
  }
  else {
    # hdr_2 is kept wrapped in OFS, the same shape the BEGIN rule uses.
    hdr_2 = OFS $0 OFS

    # Get column names and positions.
    # Make sure we pick the first occurrence of duplicated column
    # names (it may happen after a join).

    while (++p <= NF) {
      if (P2[$p] == NULL) P2[$p] = p
      N2[p] = $p
    }

    j2 = P2[c_names[2]]

    # Default to 1st column on invalid column name.
    if (j2 == NULL) {
      c_names[2] = N2[1]
      j2 = P2[c_names[2]]
    }
  }

  # Remove the join column from hdr_2 by position, not by name: the
  # field list below also skips it by position, so header and data stay
  # aligned even if the name is duplicated or holds regex characters.
  # hdr_2 is wrapped in OFS, hence b[1] and b[hdr2_size] are empty and
  # column k of table 2 is b[k + 1].
  hdr2_size = split(hdr_2, b)
  hdr_2 = NULL

  for (x = 2; x < hdr2_size; x++) {
    if (x - 1 != j2) hdr_2 = hdr_2 OFS b[x]
  }

  # Same for hdr_1: set the join column aside, keep everything else.
  hdr1_size = split(hdr_1, a)
  hdr_1 = NULL

  for (x = 1; x <= hdr1_size; x++) {
    if (x == j1) first_field = a[x]
    else hdr_1 = hdr_1 OFS a[x]
  }

  # Either drop the leading separator (join column suppressed) or put
  # the join column first.  If table 1 has no other column, the leading
  # separator to drop is the one in front of table 2's columns.
  if (no_jcol) {
    if (hdr_1 == NULL) sub( /^\t/, NULL, hdr_2)
    else sub( /^\t/, NULL, hdr_1)
  }
  else hdr_1 = first_field hdr_1

  # Now print output header and dashline.
  out_rec = hdr_1 hdr_2
  sub(/\t*$/, NULL, out_rec)
  print out_rec; gsub(/[^\t]/, "-", out_rec); print out_rec

  # Build output field list for join(1): join column first (unless
  # suppressed), then the other columns of table 1, then those of table 2.
  if (!no_jcol) field_list = " 1." P1[N1[j1]]

  c = 0
  while (N1[++c] != NULL) {
    if (N1[c] != "." && c != j1)
      field_list = field_list ",1." P1[N1[c]]
  }

  c = 0
  while (N2[++c] != NULL) {
    if (N2[c] != "." && c != j2)
      field_list = field_list ",2." P2[N2[c]]
  }

  # Without the join column the list starts with a comma; fix that after
  # both tables were added, so it also works when table 1 adds nothing.
  if (no_jcol) sub(/^,/, " ", field_list)

  join_args = join_args " -o " field_list
  join_args = join_args " -1 " j1 " -2 " j2 " " tbl[1] " " tbl[2]
  join_cmd = "join -t \"\t\" " join_args

  if (debug) print join_cmd > "@STDERR@"

  # Make sure we print the header before calling join(1).
  fflush()

  # Let's save one concurrent process and a couple of msec.
  join_cmd = "exec " join_cmd
}

# Records 1 and 2 are the header and the dashline; every later record is a
# data row and is fed to join(1) through the pipe named by join_cmd.
NR >= 3 {
  print $0 | join_cmd
}

# Wind-up: collect the exit status of join(1), remove the work-file if it
# was created by this program, and exit with that status.
END {
  # rc is already non-zero when an earlier exit() chose the status
  # (e.g. the "-h" path): keep it and do nothing else.
  if (rc) exit(rc)

  # If stdin carried no data rows, nothing has been written to join(1)
  # yet and the pipe was never opened.  An empty write opens it, so that
  # join(1) still runs (it may have unpairable rows of the other table to
  # print with "-a") and close() below returns its real exit status.
  if (join_cmd != NULL) printf "" | join_cmd

  rc = close(join_cmd)

  # tmpdir is set only when the work-file was created automatically.
  if (tmpdir != NULL) system("rm -f " tmpfile)
  exit(rc)				# Return join(1) exit status.
}

#
# End of program
#
