/* 
   sitecopy, for managing remote web sites. Stored state handling routines.
   Copyright (C) 1999-2000, Joe Orton <joe@manyfish.co.uk>
                                                                     
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
  
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
  
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

*/

#include "config.h"

#include <sys/stat.h>

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif

#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif

#include <ctype.h>
#include <errno.h>
#include <stdio.h>

#include <ne_xml.h>
#include <ne_dates.h>
#include <ne_alloc.h>
#include <ne_string.h>

#include "common.h"
#include "sitesi.h"

/* This is the string used in the old-style storage files to indicate
 * the line is a directory rather than a file */
#define DIRWORD "dir"
/* And this is used for links */
#define LINKWORD "link"

/* Use a version in the site state file: 
 * Bump the major number if a backwardly-incompatible change is made.
 */
#define SITE_STATE_FILE_VERSION "1.0"

/* NOTE(review): neither of these two constants is referenced anywhere
 * in the part of this file visible here -- confirm whether they are
 * still needed. */
#define CDATABUFSIZ 128
#define CDATASHRINK 256

static int site_read_stored_state_old(struct site *site, FILE *fp);
static int site_read_stored_state_new(struct site *site, FILE *fp);

static int check_context(void *ud, ne_xml_elmid parent, ne_xml_elmid child);

/* Read the stored state of the site from site->infofile.
 *
 * The first five bytes of the file decide which reader is used: a file
 * starting with "<?xml" (compared case-insensitively) is parsed as the
 * XML format, anything else as the old tab-separated format.
 *
 * Returns:
 *   SITE_FAILED  if the info file does not exist;
 *   SITE_ERRORS  if the file exists but cannot be opened or read;
 *   SITE_OK      if the file is empty (no files stored remotely);
 *   otherwise whatever the format-specific reader returns. */
int site_read_stored_state(struct site *site)
{
    FILE *fp;
    char buffer[6];
    int ret;

    NE_DEBUG(DEBUG_FILES, "Reading info file: %s\n", site->infofile);
    fp = fopen(site->infofile, "r" FOPEN_BINARY_FLAGS);
    if (fp == NULL) {
        /* Remember why fopen failed; stat() below may overwrite errno. */
        int open_errno = errno;
        struct stat st;

        ret = stat(site->infofile, &st);
        if ((ret == 0) || (errno != ENOENT)) {
            /* The file exists but could not be opened for reading...
             * this is an error condition. */
            NE_DEBUG(DEBUG_FILES, "Could not open info file: %s\n",
                     strerror(open_errno));
            return SITE_ERRORS;
        }
        NE_DEBUG(DEBUG_FILES, "Info file doesn't exist.\n");
        return SITE_FAILED;
    }

    /* To work out whether the file is old-style or new-style,
     * we try to read in 5 bytes from the file. */
    if (fgets(buffer, sizeof buffer, fp) == NULL) {
        if (ferror(fp)) {
            /* A read failure is not the same thing as an empty file. */
            NE_DEBUG(DEBUG_FILES, "Could not read info file: %s\n",
                     strerror(errno));
            fclose(fp);
            return SITE_ERRORS;
        }
        /* Empty file... 'no' files remotely */
        NE_DEBUG(DEBUG_FILES, "Empty storage file.\n");
        fclose(fp); /* previously leaked on this path */
        return SITE_OK;
    }

    /* Reset the file position so the reader sees the whole file. */
    rewind(fp);
    if (strncasecmp(buffer, "<?xml", 5) == 0) {
        /* XML-format */
        NE_DEBUG(DEBUG_FILES, "New-style XML storage file.\n");
        ret = site_read_stored_state_new(site, fp);
    } else {
        /* Old-style */
        NE_DEBUG(DEBUG_FILES, "Old-style storage file.\n");
        ret = site_read_stored_state_old(site, fp);
    }
    fclose(fp);
    return ret;
}

/* Line terminator appended to the lines emitted by
 * site_write_stored_state(); the macro is #undef'd again after that
 * function. */
#define EOL "\r\n"

/* Opens the storage file for writing */
/* Returns the stream used for writing the site's storage file.  If no
 * stream is cached in site->storage_file yet, site->infofile is opened
 * for writing and the result (NULL on failure) is cached and
 * returned. */
FILE *site_open_storage_file(struct site *site) 
{
    if (site->storage_file != NULL) {
        /* Already open: hand back the cached stream. */
        return site->storage_file;
    }

    site->storage_file = fopen(site->infofile, "w" FOPEN_BINARY_FLAGS);
    return site->storage_file;
}

/* Closes the site's storage file stream, if one is open, and clears
 * site->storage_file.
 *
 * Returns the result of fclose() (0 on success, EOF on failure), or 0
 * when there was no open stream to close. */
int site_close_storage_file(struct site *site)
{
    int ret = 0;

    /* fclose(NULL) is undefined behaviour, so only close a real stream. */
    if (site->storage_file != NULL) {
        ret = fclose(site->storage_file);
        site->storage_file = NULL;
    }
    return ret;
}

/* Write out the stored state for the site. 
 * Returns 0 on success, non-zero on error. */
int site_write_stored_state(struct site *site) 
{
    struct site_file *current;
    FILE *fp = site_open_storage_file(site);

    if (fp == NULL) {
	return -1;
    }

    fprintf(fp, "<?xml version=\"1.0\"?>" EOL);
    /* TODO: DTD */
    fprintf(fp, "<sitestate version=\"" SITE_STATE_FILE_VERSION "\">" EOL);
    fprintf(fp, "<options>" EOL);
    fprintf(fp, " <saved-by package=\"" PACKAGE "\" version=\"" VERSION "\"/>" EOL);
    if (site->state_method == state_checksum) {
	/* For forwards-compatibility */
	fprintf(fp, " <checksum-algorithm><checksum-MD5/></checksum-algorithm>" EOL);
    }
    fprintf(fp, " <state-method><state-%s/></state-method>" EOL,
	     (site->state_method==state_checksum)?"checksum":"timesize");
    if (site->safemode) {
	fprintf(fp, " <safemode/>" EOL);
    }
    fprintf(fp, "</options>" EOL);
    fprintf(fp, "<items>" EOL);
    /* Now write out the items */
    for (current = site->files; current!=NULL; current = current->next) {
	unsigned const char *pnt;
	if (!current->stored.exists) continue;
	fprintf(fp, "<item>");
	fprintf(fp, "<type><type-%s/></type>",
		 (current->type==file_file)?"file":(
		     (current->type==file_dir)?"directory":"link"));
	fprintf(fp, "<filename>");
	/* Filenames can contain all sorts of cruft, so we write out
	 * XML character references; "&#x" <HEX><HEX> ";" */
	for (pnt = current->stored.filename; *pnt!='\0'; pnt++) {
	    /* Escape characters which need escaping... ignore
	     * ASCII characters commonly found in filenames. */
	    if (isalnum(*pnt) || *pnt == '/' || *pnt == '.' || 
		*pnt == '-' || *pnt == '_') { 
		fputc(*pnt, site->storage_file);
	    } else {
		/* TODO: write out a two-byte UTF-8 char here instead,
		 * it's more space-efficient */
		fprintf(fp, "&#x%c%c;", NE_HEX2ASC((*pnt&0xf0)>>4), 
			 NE_HEX2ASC(*pnt&0x0f));
	    }
	}
	fprintf(fp, "</filename>" EOL);
	switch (current->type) {
	case file_link:
	    fprintf(fp, "<linktarget>%s</linktarget>", 
		     current->stored.linktarget);
	    break;
	case file_file:
	    fprintf(fp, "<protection>%03o</protection>", 
		     current->stored.mode); /* three-digit octal */
	    fprintf(fp, "<size>%" NE_FMT_OFF_T "</size>", 
		    current->stored.size);
	    switch (site->state_method) {
	    case state_checksum: {
		char csum[33];
		ne_md5_to_ascii(current->stored.checksum, csum);
		fprintf(fp, "<checksum>%s</checksum>", csum);
	    } break;
	    case state_timesize:
		fprintf(fp, "<modtime>%ld</modtime>", current->stored.time);
		break;
	    }
	    fprintf(fp, "<ascii>%s</ascii>",
		     current->stored.ascii?"<true/>":"<false/>");
	    if (current->server.exists) {
		fprintf(fp, "<server-modtime>%ld</server-modtime>", 
			 current->server.time);
	    }
	    break;
	case file_dir:
	    /* nothing to do */
	    break;
	}
	fprintf(fp, "</item>" EOL);
    }
    fprintf(fp, "</items>" EOL);
    fprintf(fp, "</sitestate>" EOL);
    site->stored_state_method = site->state_method;
    return site_close_storage_file(site);
}

#undef EOL

/* neon ne_xml-based XML parsing */

/* Numeric ids for the elements of the site state document.  Each id is
 * an offset from ELM_BASE; the ids are bound to element names in the
 * ss_elms table and switched on in check_context() and end_element(). */
#define ELM_BASE 500
#define SITE_ELM_sitestate (ELM_BASE + 1)
#define SITE_ELM_options (ELM_BASE + 2)
#define SITE_ELM_opt_saved_by (ELM_BASE + 3)
#define SITE_ELM_opt_checksum (ELM_BASE + 4)
#define SITE_ELM_opt_checksum_md5 (ELM_BASE + 5)
#define SITE_ELM_opt_state_method (ELM_BASE + 6)
#define SITE_ELM_opt_state_method_timesize (ELM_BASE + 7)
#define SITE_ELM_opt_state_method_checksum (ELM_BASE + 8)
#define SITE_ELM_items (ELM_BASE + 9)
#define SITE_ELM_item (ELM_BASE + 10)
#define SITE_ELM_type (ELM_BASE + 11)
#define SITE_ELM_type_file (ELM_BASE + 12)
#define SITE_ELM_type_directory (ELM_BASE + 13)
#define SITE_ELM_type_link (ELM_BASE + 14)
#define SITE_ELM_filename (ELM_BASE + 15)
#define SITE_ELM_size (ELM_BASE + 16)
#define SITE_ELM_modtime (ELM_BASE + 17)
#define SITE_ELM_ascii (ELM_BASE + 18)
#define SITE_ELM_linktarget (ELM_BASE + 19)
#define SITE_ELM_checksum (ELM_BASE + 20)
#define SITE_ELM_protection (ELM_BASE + 21)
#define SITE_ELM_server_modtime (ELM_BASE + 22)
#define SITE_ELM_true (ELM_BASE + 23)
#define SITE_ELM_false (ELM_BASE + 24)

/* Element table passed to ne_xml_push_handler() by
 * site_read_stored_state_new().  Each row gives a string that is
 * always empty here (presumably the namespace -- confirm against the
 * neon headers), the element name, its SITE_ELM_* id, and flags.
 * NE_XML_CDATA is set on every element whose character data
 * end_element() consumes (and on saved-by). */
static const struct ne_xml_elm ss_elms[] = {
    { "", "sitestate", SITE_ELM_sitestate, 0 },
    { "", "options", SITE_ELM_options, 0 },
    { "", "saved-by", SITE_ELM_opt_saved_by, NE_XML_CDATA },
    { "", "checksum-algorithm", SITE_ELM_opt_checksum, 0 },
    { "", "checksum-MD5", SITE_ELM_opt_checksum_md5, 0 },
    { "", "state-method", SITE_ELM_opt_state_method, 0 },
    { "", "state-timesize", SITE_ELM_opt_state_method_timesize, 0 },
    { "", "state-checksum", SITE_ELM_opt_state_method_checksum, 0 },
    { "", "items", SITE_ELM_items, 0 },
    { "", "item", SITE_ELM_item, 0 },
    { "", "type", SITE_ELM_type, 0 },
    { "", "type-file", SITE_ELM_type_file, 0 },
    { "", "type-directory", SITE_ELM_type_directory, 0 },
    { "", "type-link", SITE_ELM_type_link, 0 },
    /* The writer emits filenames as character references, hence the
     * extra decode flag on this element only. */
    { "", "filename", SITE_ELM_filename, NE_XML_CDATA | NE_XML_UTF8DECODE },
    { "", "size", SITE_ELM_size, NE_XML_CDATA },
    { "", "modtime", SITE_ELM_modtime, NE_XML_CDATA },
    { "", "ascii", SITE_ELM_ascii, 0 },
    { "", "linktarget", SITE_ELM_linktarget, NE_XML_CDATA },
    { "", "checksum", SITE_ELM_checksum, NE_XML_CDATA },
    { "", "protection", SITE_ELM_protection, NE_XML_CDATA },
    { "", "server-modtime", SITE_ELM_server_modtime, NE_XML_CDATA },
    { "", "true", SITE_ELM_true, 0 },
    { "", "false", SITE_ELM_false, 0 },
    /* Presumably a catch-all row for elements not listed above; they
     * reach check_context() as NE_ELM_unknown. */
    { "", "", NE_ELM_unknown, NE_XML_COLLECT },
    { NULL }, /* end of table */
};

/* Parse state shared between the XML callbacks while reading a
 * new-style state file; one instance is created on the stack by
 * site_read_stored_state_new() and passed as the handler userdata. */
struct site_xmldoc {
    /* The site whose stored state is being read. */
    struct site *site;
    /* What we've collected so far */
    /* Type of the item being read, set by the <type-*> elements. */
    enum file_type type;
    /* Stored state of the item, filled in from its child elements. */
    struct file_state stored;
    /* Server state of the item; only set from <server-modtime>. */
    struct file_state server;
    /* Result of the last <true/> or <false/> element, consumed when
     * the enclosing <ascii> element ends. */
    unsigned int truth:2; /* 0: invalid, 1: true, 2: false */
};

/* Element-nesting check installed with ne_xml_push_handler().
 *
 * Decides whether an element with id `child` may appear directly
 * inside an element with id `parent`.  Unknown elements are tolerated
 * inside <options>, <item>, <saved-by> and other unknown elements, to
 * leave room for extensions.
 *
 * `ud` is not used.  Returns 0 when the nesting is allowed, -1
 * otherwise. */
static int check_context(void *ud, ne_xml_elmid parent, ne_xml_elmid child)
{
    int allowed = 0;

    switch (parent) {
    case NE_ELM_root:
        allowed = (child == SITE_ELM_sitestate);
        break;
    case SITE_ELM_sitestate:
        allowed = (child == SITE_ELM_options ||
                   child == SITE_ELM_items);
        break;
    case SITE_ELM_options:
        allowed = (child == SITE_ELM_opt_saved_by ||
                   child == SITE_ELM_opt_checksum ||
                   child == SITE_ELM_opt_state_method ||
                   child == NE_ELM_unknown); /* anything we don't understand */
        break;
    case SITE_ELM_opt_checksum:
        allowed = (child == SITE_ELM_opt_checksum_md5);
        break;
    case SITE_ELM_opt_state_method:
        allowed = (child == SITE_ELM_opt_state_method_checksum ||
                   child == SITE_ELM_opt_state_method_timesize);
        break;
    case SITE_ELM_items:
        allowed = (child == SITE_ELM_item);
        break;
    case SITE_ELM_item:
        allowed = (child == SITE_ELM_type ||
                   child == SITE_ELM_filename ||
                   child == SITE_ELM_checksum ||
                   child == SITE_ELM_linktarget ||
                   child == SITE_ELM_protection ||
                   child == SITE_ELM_size ||
                   child == SITE_ELM_modtime ||
                   child == NE_ELM_unknown ||
                   child == SITE_ELM_server_modtime ||
                   child == SITE_ELM_ascii);
        break;
    case SITE_ELM_type:
        allowed = (child == SITE_ELM_type_file ||
                   child == SITE_ELM_type_directory ||
                   child == SITE_ELM_type_link);
        break;
    case SITE_ELM_opt_saved_by: /* allow for putting stuff in here */
    case NE_ELM_unknown:        /* allow for extensions */
        allowed = (child == NE_ELM_unknown);
        break;
    case SITE_ELM_ascii:
        allowed = (child == SITE_ELM_true ||
                   child == SITE_ELM_false);
        break;
    default:
        /* Every other element may hold cdata only, or nothing. */
        break;
    }

    return allowed ? 0 : -1;
}

static int end_element(void *userdata, const struct ne_xml_elm *elm, const char *cdata) 
{
    struct site_xmldoc *doc = userdata;
    
    /* Dispatch Ajax */
    switch (elm->id) {
    case SITE_ELM_opt_state_method_timesize:
	doc->site->stored_state_method = state_timesize;
	break;
    case SITE_ELM_opt_state_method_checksum:
	doc->site->stored_state_method = state_checksum;
	break;
    case SITE_ELM_type_file:
	doc->type = file_file;
	break;
    case SITE_ELM_type_directory:
	doc->type = file_dir;
	break;
    case SITE_ELM_type_link:
	doc->type = file_link;
	break;
    case SITE_ELM_filename:
	doc->stored.filename = ne_strdup(cdata);
	break;
    case SITE_ELM_checksum:
	if (strlen(cdata) > 32) {
	    /* FIXME: error */
	    return -1;
	} else {
	    /* FIXME: validate */
	    ne_ascii_to_md5(cdata, doc->stored.checksum);
#ifdef DEBUGGING
	    {
		char tmp[33];
		ne_md5_to_ascii(doc->stored.checksum, tmp);
		NE_DEBUG(DEBUG_FILES, "Checksum recoded: [%32s]\n", tmp);
	    }
#endif /* DEBUGGING */
	}
	break;
    case SITE_ELM_size:
	doc->stored.size = strtol(cdata, NULL, 10);
	if (doc->stored.size == LONG_MAX)
	    return -1;
	break;
    case SITE_ELM_protection:
	doc->stored.mode = strtoul(cdata, NULL, 8);
	if (doc->stored.mode == ULONG_MAX)
	    return -1;
	break;
    case SITE_ELM_server_modtime:
	doc->server.time = strtol(cdata, NULL, 10);
	if (doc->server.time == LONG_MIN || doc->server.time == LONG_MAX)
	    return -1;
	doc->server.exists = true;
	break;
    case SITE_ELM_modtime:
	doc->stored.time = strtol(cdata, NULL, 10);
	if (doc->stored.time == LONG_MIN || doc->stored.time == LONG_MAX)
	    return -1;
	break;
    case SITE_ELM_true:
	doc->truth = 1;
	break;
    case SITE_ELM_false:
	doc->truth = 2;
	break;
    case SITE_ELM_ascii:
	if (doc->truth) {
	    doc->stored.ascii = (doc->truth==1);
	} else {
	    return -1;
	}
	break;
    case SITE_ELM_linktarget:
	doc->stored.linktarget = ne_strdup(cdata);
	break;
    case SITE_ELM_item: {
	struct site_file *file;
	/* Gordon's aliiiiiveeeeee... */
	doc->stored.exists = true;
	file = file_set_stored(doc->type, &doc->stored, doc->site);
	if (doc->server.exists) {
	    file_state_copy(&file->server, &doc->server, doc->site);
	}
	DEBUG_DUMP_FILE_PROPS(DEBUG_FILES, file, doc->site);
    }	break;
    default:
	break;
    }

    return 0;

}

/* Read an XML-format state storage file from fp.
 *
 * The file is fed to a neon XML parser in BUFSIZ-sized chunks, with
 * check_context() and end_element() installed as the callbacks and a
 * zero-initialised struct site_xmldoc as their userdata.  Reading
 * stops at end of file, on a read error, or as soon as the parser
 * reports the document invalid.
 *
 * Returns SITE_OK on success.  On a parse or read error returns
 * SITE_ERRORS with site->last_error set to a duplicated message. */
static int site_read_stored_state_new(struct site *site, FILE *fp)
{
    struct site_xmldoc doc = {0};
    ne_xml_parser *parser;
    int status = 0; /* 0: keep reading, 1: reached EOF, -1: read error */
    int result;

    doc.site = site;

    parser = ne_xml_create();
    ne_xml_push_handler(parser, ss_elms, check_context, NULL, end_element, &doc);

    for (;;) {
        char chunk[BUFSIZ];
        int got = fread(chunk, 1, BUFSIZ, fp);

        if (got < BUFSIZ) {
            if (feof(fp)) {
                status = 1;
            } else if (ferror(fp)) {
                /* And don't parse anything else... */
                status = -1;
                break;
            }
        }

        ne_xml_parse(parser, chunk, got);

        if (status != 0 || !ne_xml_valid(parser)) {
            break;
        }
    }

    if (!ne_xml_valid(parser)) {
        site->last_error = ne_strdup(ne_xml_get_error(parser));
        result = SITE_ERRORS;
    } else if (status < 0) {
        site->last_error = ne_strdup(strerror(errno));
        result = SITE_ERRORS;
    } else {
        result = SITE_OK;
    }

    ne_xml_destroy(parser);

    return result;
}

/* Read an old-format state storage file from fp.
 *
 * Format: one item per line, tab-separated fields.
 *   Field 1 is the filename of the item (its first character is
 *           skipped).
 *   Field 2 is DIRWORD for directories, LINKWORD for links, or the
 *           modification time for files.
 *   Field 3 is the link target (links) or the size (files).
 *
 * The old format can only describe time/size state, so
 * site->stored_state_method is set to state_timesize.
 *
 * Returns SITE_OK on success, or SITE_ERRORS as soon as a corrupt
 * line is found. */
static int site_read_stored_state_old(struct site *site, FILE *fp)
{
    char buf[BUFSIZ], /* for the line */
        tmp[BUFSIZ],  /* for the current field */
        *pos,         /* for the current position within buf */
        *point;       /* for the current position within tmp */
    struct file_state stored_state;
    /* Initialised only to keep the compiler quiet; always assigned
     * before use whenever got_type is set. */
    enum file_type type = file_new;
    int got_type, state;

    /* That's all we can do */
    site->stored_state_method = state_timesize;

    while (fgets(buf, BUFSIZ, fp) != NULL) {
        /* Make sure we have an end-of-buffer */
        buf[BUFSIZ-1] = '\0';
        /* Clear the per-line state.  (This used to clear the `state`
         * integer with the size of the structure, overrunning it and
         * leaving stored_state uninitialised.) */
        memset(&stored_state, 0, sizeof stored_state);
        /* Now parse the line. Simple DFA, states are:
         *  0: Reading filename, field 1
         *  1: Reading date/time stamp, DIRWORD or LINKWORD, field 2
         *  2: Reading file size, field 3 (if file)
         *  3: Junk state - consume-all-crud-till-end-of-line.
         *  4: Reading link target, field 3 (if link)
         *  5: Error state - the line is corrupt.
         *
         * We read the current field into tmp char by char.
         * point is the current point within tmp. */
        state = 0;
        got_type = false;
        point = tmp;
        for (pos = buf; *pos != '\0'; pos++) {
            /* Negative chars are treated as corruption.
             * NOTE(review): this only triggers where plain char is
             * signed. */
            if (*pos < 0) state = 5;
            switch (state) {
            case 0:
                if (*pos == '\t') {
                    /* End of the filename */
                    if (point == tmp) {
                        /* Empty field: tmp + 1 would point past the
                         * terminator. */
                        state = 5;
                    } else {
                        *point = '\0';
                        /* +1 to skip the leading / */
                        stored_state.filename = ne_strdup(tmp + 1);
                        point = tmp;
                        state = 1;
                    }
                } else {
                    /* Still in the filename */
                    *(point++) = *pos;
                }
                break;
            case 1:
                if (*pos == '\t' || *pos == '\n') {
                    /* End of the second field */
                    *point = '\0';
                    if (strlen(tmp) > 0) {
                        if (strcmp(tmp, DIRWORD) == 0) {
                            /* It's a directory! */
                            type = file_dir;
                            state = 3; /* that's all we need */
                        } else if (strcmp(tmp, LINKWORD) == 0) {
                            type = file_link;
                            point = tmp;
                            state = 4; /* read the link target */
                        } else {
                            /* It's a file! - field 2 is the mtime */
                            type = file_file;
                            stored_state.time = atol(tmp);
                            point = tmp;
                            state = 2;
                        }
                        got_type = true;
                    } else {
                        /* Corrupt line, we need at least two fields */
                        /* FIXME: Report this to the user. */
                        state = 5;
                    }
                } else {
                    /* Within the second field */
                    *(point++) = *pos;
                }
                break;
            case 2:
                if (*pos == '\n') {
                    /* End of the size field */
                    *point = '\0';
                    stored_state.size = atol(tmp);
                    state = 3;
                } else {
                    /* Within the file size field */
                    *(point++) = *pos;
                }
                break;
            case 3: /* junk state */
                break;
            case 4: /* read the link name */
                if (*pos == '\n') {
                    /* End of the field */
                    *point = '\0';
                    stored_state.linktarget = ne_strdup(tmp);
                    state = 3;
                } else {
                    *(point++) = *pos;
                }
                break;
            case 5: /* error state */
                break;
            default:
                break;
            }
        }
        if ((state == 5) || (stored_state.filename == NULL) || (!got_type)) {
            /* FIXME: quit */
            NE_DEBUG(DEBUG_FILES, "Corrupt line.\n");
            return SITE_ERRORS;
        } else {
            struct site_file *current;
            stored_state.exists = true;
            current = file_set_stored(type, &stored_state, site);
            DEBUG_DUMP_FILE_PROPS(DEBUG_FILES, current, site);
        }
    }
    return SITE_OK;
}

