/*
 *   Copyright (C) 1997, 1998, 1999, 2000 Loic Dachary
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#include <stdio.h>
#include <string.h>
#ifdef HAVE_DMALLOC_H
#include <dmalloc.h>
#endif /* HAVE_DMALLOC_H */

#include <khash.h>
#include <getopttools.h>
#include <uri.h>
#include <salloc.h>

#include <robots.h>
#include <robots_parser.h>
#include <mysql.h>
#include <sqlutil.h>
#include <dirsel.h>

/*
 * Forward declarations of the file-local helpers defined further down.
 */
/* Copy one row of the robots table into robots->current. */
static void robots_decode(robots_t* robots, MYSQL_RES *res, MYSQL_ROW row);
/*
 * Returns 0 if ok, -1 if should try again.
 */
static int robots_load_1(robots_t* robots, crawl_context_t* context, uri_t* url_object);
/* Write next_crawl/queue_crawl of robots->current back to the base when the entry is in use. */
static void robots_save_entry(robots_t* robots);
/* Zero the scalar fields of robots->current and empty its strings (buffers are kept). */
static void robots_reset_entry(robots_t* robots);
/* Allocate a zero-filled robots_t together with its options hash. */
static robots_t* params_alloc();

/*
 * Set to 1 through the verbose_robots option below; when non-zero the
 * functions of this file trace their activity on stderr.
 */
static int verbose = 0;

/*
 * Command line options declared by this module, in getopt_long format
 * (name, has_arg, flag, val).
 */
static struct option long_options[] =
{
  /* These options set a flag. */
  {"verbose_robots", 0, &verbose, 1},
  /*
   * End-of-table entry (null name).
   * NOTE(review): ROBOTS_OPTIONS in the val slot is presumably a tag that
   * identifies this table to the option merging code -- confirm.
   */
  {0, 0, 0, ROBOTS_OPTIONS}
};

/*
 * Help text for the options above, one (name, description) pair per option.
 */
static struct option_help long_options_help[] =
{
  /* These options set a flag. */
  {"verbose_robots", "robots.txt loading and parsing related messages."},
  /* NOTE(review): {"0", ""} looks like the end-of-table marker -- confirm. */
  {"0", ""}
};

/*
 * Give access to the option table of this module.
 *
 * The argument is not used; the function always returns the address
 * of the file-local long_options array.
 */
struct option* robots_options(struct option[])
{
  struct option* table = long_options;

  return table;
}

/*
 * Give access to the help table of this module.
 *
 * The argument is not used; the function always returns the address
 * of the file-local long_options_help array.
 */
struct option_help* robots_help_options(struct option_help [])
{
  struct option_help* table = long_options_help;

  return table;
}

/*
 * Create a robots_t object and scan the command line once.
 *
 *   argc, argv  command line to scan with getopt_long_only
 *   options     option table handed to getopt_long_only
 *
 * getopt error messages are silenced (opterr = 0) and the scan is
 * restarted from the beginning (optind = 0).
 *
 * An option is recorded in params->options only when getopt_long_only
 * returns 0 for it and the matching table entry either has a flag
 * pointer or an empty name. The recorded value is a strdup'ed copy of
 * optarg, or of " " when there is no argument. Every other return
 * code is skipped.
 *
 * Returns the newly allocated object; release it with robots_free.
 */
robots_t* robots_alloc(int argc, char** argv, struct option options[])
{
  robots_t* self = params_alloc();

  opterr = 0;
  optind = 0;
  for(;;) {
    /* `getopt_long' stores the option index here. */
    int index = 0;
    int code = getopt_long_only(argc, argv, "-", options, &index);

    /* No more options. */
    if(code == -1)
      break;

    /* Anything but a long option match is ignored. */
    if(code != 0)
      continue;

    /* A flag-less entry is only kept when its name is empty. */
    if(options[index].flag == 0 && strcmp(options[index].name, "") != 0)
      continue;

    hash_alloc_insert(self->options, (char*)options[index].name, strdup(optarg ? optarg : " "));
  }

  return self;
}

#define ROBOTS_NAME "robots.txt"
#define ROBOTS_NAME_LENGTH 10

/*
 * Tell whether path ends with ROBOTS_NAME.
 *
 * Only the last ROBOTS_NAME_LENGTH characters are compared, so any
 * path whose tail is "robots.txt" matches, whatever precedes it.
 *
 * Returns 1 on match, 0 otherwise.
 */
int robots_p(char* path)
{
  int path_length = strlen(path);
  char* tail;

  /* Too short to contain the name at all. */
  if(path_length < ROBOTS_NAME_LENGTH)
    return 0;

  tail = path + (path_length - ROBOTS_NAME_LENGTH);
  return strcmp(tail, ROBOTS_NAME) == 0;
}

/*
 * Make robots->current describe the host that serves url.
 *
 *   robots   object whose current entry is updated
 *   context  crawl context forwarded to robots_load_1
 *   url      URL whose host is of interest
 *
 * Nothing happens unless the scheme of url starts with "http"
 * (case insensitive). When the netloc of url is the netloc of the
 * current entry, the entry is kept as is. Otherwise the entry is
 * loaded with robots_load_1.
 *
 * The parsed URL is kept in a function-local static object that is
 * reused from one call to the next.
 */
void robots_load(robots_t* robots, crawl_context_t* context, char* url)
{
  static uri_t* parsed_url = 0;
  int url_length = strlen(url);
  char* netloc;

  /*
   * Parse url, allocating the url object on first use only.
   */
  if(parsed_url)
    uri_realloc(parsed_url, url, url_length);
  else
    parsed_url = uri_alloc(url, url_length);

  if(strncasecmp(uri_scheme(parsed_url), "http", 4) != 0)
    return;

  netloc = uri_netloc(parsed_url);

  if(strcmp(netloc, robots->current.netloc) == 0) {
    if(verbose) fprintf(stderr, "robots_load: reuse current %s\n", netloc);
    return;
  }

  if(verbose) fprintf(stderr, "robots_load: load %s\n", netloc);
  /*
   * A negative return means the insertion of the robots entry failed:
   * between the time we checked that the entry did not exist and the
   * time of the insertion, another process inserted it for us. Run
   * the load a second time, the record will be found.
   */
  if(robots_load_1(robots, context, parsed_url) < 0)
    robots_load_1(robots, context, parsed_url);
}

/*
 * Decide whether the crawl of url_object must be postponed to honor
 * robot_delay on the host described by robots->current.
 *
 *   robots       object holding the current host entry
 *   robot_delay  delay added to the crawl times, in the unit of
 *                time(); 0 disables the delay logic
 *   url_object   URL being considered (used for the trace only)
 *   delay        out: set to the queue_crawl time of the entry when
 *                the crawl is delayed, left untouched otherwise
 *   delayed      out: ROBOTS_DELAYED or ROBOTS_NOT_DELAYED
 *
 * When the next_crawl time of the entry is not in the future the
 * crawl may proceed and both next_crawl and queue_crawl are set to
 * now + robot_delay. Otherwise the crawl is queued at queue_crawl,
 * which is then pushed back by robot_delay for the next caller.
 */
void robots_info(robots_t* robots, int robot_delay, uri_t* url_object, time_t* delay, int* delayed)
{
  time_t now = time(0);
  robots_entry_t* entry = &robots->current;

  if(robot_delay == 0) {
    *delayed = ROBOTS_NOT_DELAYED;
  } else if(entry->next_crawl <= now) {
    entry->queue_crawl = entry->next_crawl = now + robot_delay;
    *delayed = ROBOTS_NOT_DELAYED;
  } else {
    *delay = entry->queue_crawl;
    entry->queue_crawl += robot_delay;
    *delayed = ROBOTS_DELAYED;
  }

  if(verbose) {
    /*
     * *delay is only written on the delayed path: do not read the
     * caller's (possibly uninitialized) value otherwise. The cast
     * makes the argument match %ld whatever the width of time_t.
     */
    long shown_delay = (*delayed == ROBOTS_DELAYED) ? (long)*delay : 0L;

    fprintf(stderr, "robots_info: next_crawl = %d, queue_crawl = %d\n", entry->next_crawl, entry->queue_crawl);
    fprintf(stderr, "robots_info: for %s => delayed = %s, delay = %ld\n", uri_uri(url_object), (*delayed ? "yes" : "no"), shown_delay);
  }
}

/*
 * Load into robots->current the robots table entry for the netloc of
 * url_object, creating the entry when it does not exist yet.
 *
 *   robots      object whose current entry is replaced
 *   context     crawl context used to fetch robots.txt
 *   url_object  parsed URL designating the host
 *
 * The previous current entry is first saved with robots_save_entry.
 * If the table already has a row for the netloc it is decoded.
 * Otherwise robots.txt is fetched through mirror(), parsed when
 * content is available, and a new row is written and read back.
 * Finally the allow/disallow lists of the entry, when not empty, are
 * handed to dirsel with DIRSEL_LOAD.
 *
 * Returns 0 if ok, -1 if the row could not be written because of a
 * duplicate entry error, in which case the caller should try again.
 * Any other database error, or an unexpected row count after the
 * write, terminates the process with exit(1).
 *
 * NOTE(review): netloc, allow and disallow are pasted into the SQL
 * text without escaping; confirm that uri_netloc() and robots_parse()
 * can never yield text that breaks out of the statement.
 */
static int robots_load_1(robots_t* robots, crawl_context_t* context, uri_t* url_object)
{
  static char* query = 0;
  static int query_size = 0;
  /*
   * Work on a private copy of the netloc.
   * NOTE(review): presumably because later uri_* calls may reuse the
   * buffer returned by uri_netloc -- confirm.
   */
  char* netloc = strdup(uri_netloc(url_object));
  int netloc_length = strlen(netloc);
  webbase_t* base = robots->base;
  MYSQL_RES *res;
  MYSQL_ROW row;
  char* query_format = "select * from robots where netloc = '%s'";

  static_alloc(&query, &query_size, 128 + netloc_length);

  /* Flush the entry about to be replaced before it is overwritten. */
  robots_save_entry(robots);

  sprintf(query, query_format, netloc);
  if(verbose) fprintf(stderr, "robots_load: query = %s\n", query);
  smysql_query(&base->mysql, query);
  res = smysql_store_result(&base->mysql);
  if(mysql_num_rows(res)) {
    if(mysql_num_rows(res) > 1) {
      fprintf(stderr, "robots_load: unexpected number of rows query = %s, rows = %ld (ignored)\n", query, (long)mysql_num_rows(res));
    }

    /* Only the first row is used, extra rows are ignored. */
    row = mysql_fetch_row(res);
    robots_decode(robots, res, row);
    if(verbose) fprintf(stderr, "robots_load: found entry next_crawl = %d\n", robots->current.next_crawl);
  } else {
    char* robots_txt = uri_robots(url_object);
    webbase_url_t* webbase_url;
    /* SQL literals stored when robots.txt is missing or not parsed. */
    char* allow = "null";
    char* disallow = "null";

    /* The empty result is of no further use, res is reassigned below. */
    mysql_free_result(res);

    robots_reset_entry(robots);

    mirror_collect((char*)context, 0, robots_txt, WEBBASE_URL_WALK_ROBOTS);
    webbase_url = mirror(context, robots_txt);

    if(webbase_url &&
       (webbase_url->w_info & WEBBASE_URL_INFO_CONTENT) &&
       webbase_url->w_content_length > 0) {
#ifdef WEBBASE_CONTENT_BASE
      if(webbase_url->content && webbase_url->content_length > 0) {
        robots_parse(webbase_url->content, webbase_url->content_length, &allow, &disallow);
      }

#else /* WEBBASE_CONTENT_BASE */
      char* path = webbase_url_file(webbase_url->w_rowid);
      robots_parse(path, &allow, &disallow);
#endif /* WEBBASE_CONTENT_BASE */
    }

    static_alloc(&query, &query_size, 128 + netloc_length + strlen(allow) + strlen(disallow));

    sprintf(query, "replace into robots values (0, '%s', %d, %d, %s, %s)",
            netloc,
            robots->current.next_crawl,
            robots->current.queue_crawl,
            allow,
            disallow);
    if(verbose) fprintf(stderr, "robots_load: query = %s\n", query);
    if(mysql_query(&base->mysql, query)) {
      if(mysql_errno(&base->mysql) != ER_DUP_ENTRY) {
        fprintf(stderr, "%s: %s\n", query, mysql_error(&base->mysql));
        exit(1);
      } else {
        /* Duplicate entry: tell the caller to run the load again. */
        free(netloc);
        return -1;
      }
    }

    /*
     * Query the base to get the information just inserted. This
     * is not optimal but prevents duplicating code from robots_decode.
     */
    sprintf(query, query_format, netloc);
    smysql_query(&base->mysql, query);
    res = smysql_store_result(&base->mysql);
    if(mysql_num_rows(res) != 1) {
      /*
       * Fatal condition: always say why the process stops, as the
       * other fatal path above does, instead of only when verbose.
       */
      fprintf(stderr, "robots_load: expected 1 row, query = %s, rows = %ld\n", query, (long)mysql_num_rows(res));
      exit(1);
    }
    row = mysql_fetch_row(res);
    robots_decode(robots, res, row);
  }

  /* Hand the non-empty rule lists of the entry to dirsel. */
  if(robots->current.disallow && robots->current.disallow[0] != '\0')
    dirsel_robots_disallow(netloc, robots->current.disallow, DIRSEL_LOAD);
  if(robots->current.allow && robots->current.allow[0] != '\0')
    dirsel_robots_allow(netloc, robots->current.allow, DIRSEL_LOAD);

  mysql_free_result(res);
  free(netloc);

  return 0;
}

/*
 * Put robots->current back to an empty state.
 *
 * rowid, info, next_crawl and queue_crawl are zeroed and netloc is
 * made an empty string. The allow and disallow buffers, when they
 * exist, are truncated to empty strings but stay allocated.
 */
static void robots_reset_entry(robots_t* robots)
{
  robots_entry_t* current = &robots->current;

  current->rowid = 0;
  current->info = 0;
  current->next_crawl = 0;
  current->queue_crawl = 0;

  *current->netloc = '\0';
  if(current->allow) {
    *current->allow = '\0';
  }
  if(current->disallow) {
    *current->disallow = '\0';
  }
}

/*
 * Write the crawl times of robots->current back to the robots table.
 *
 * The update is only sent when the entry carries ROBOTS_INFO_USED;
 * otherwise the function returns without touching the database.
 * The query buffer is a function-local static sized by static_alloc
 * on every call.
 */
static void robots_save_entry(robots_t* robots)
{
  static char* sql = 0;
  static int sql_size = 0;
  robots_entry_t* current = &robots->current;

  static_alloc(&sql, &sql_size, 128 + strlen(current->netloc));

  /* Nothing to save when the entry is not in use. */
  if(!(current->info & ROBOTS_INFO_USED))
    return;

  sprintf(sql, "update robots set next_crawl = %d, queue_crawl = %d where netloc = '%s'",
          current->next_crawl,
          current->queue_crawl,
          current->netloc);
  if(verbose) fprintf(stderr, "robots_save_entry: query = %s\n", sql);
  smysql_query(&robots->base->mysql, sql);
}

/*
 * Fill robots->current from one row of the robots table.
 *
 * The entry is reset first. Columns are read by position:
 * 0 rowid, 1 netloc, 2 next_crawl, 3 queue_crawl, 4 allow,
 * 5 disallow. The allow and disallow columns are only copied when
 * not NULL, growing the entry buffers through static_alloc. The
 * entry is finally flagged ROBOTS_INFO_USED.
 *
 * The result set argument is not used.
 */
static void robots_decode(robots_t* robots, MYSQL_RES *, MYSQL_ROW row)
{
  robots_entry_t* current = &robots->current;
  char* allow_column;
  char* disallow_column;

  robots_reset_entry(robots);

  current->rowid = atoi(row[0]);
  strcpy(current->netloc, row[1]);
  current->next_crawl = atoi(row[2]);
  current->queue_crawl = atoi(row[3]);

  allow_column = row[4];
  if(allow_column) {
    static_alloc(&current->allow, &current->allow_size, strlen(allow_column) + 1);
    strcpy(current->allow, allow_column);
  }

  disallow_column = row[5];
  if(disallow_column) {
    static_alloc(&current->disallow, &current->disallow_size, strlen(disallow_column) + 1);
    strcpy(current->disallow, disallow_column);
  }

  current->info |= ROBOTS_INFO_USED;

  if(verbose) fprintf(stderr, "robots_decode: %s next = %d\n", current->netloc, current->next_crawl);
}

/*
 * Release a robots_t object obtained from robots_alloc.
 *
 * The current entry is saved with robots_save_entry before its
 * allow/disallow buffers, the options hash and the object itself are
 * released.
 */
void robots_free(robots_t* params)
{
  robots_save_entry(params);

  /* free() accepts a null pointer, no guard is needed. */
  free(params->current.allow);
  free(params->current.disallow);

  _K(hash_free)(params->options);
  free(params);
}

/*
 * Node release function installed on the options hash by
 * params_alloc: frees the data attached to the node, then the node
 * itself. The second argument is not used.
 */
static void hnode_free(hnode_t *node, void *)
{
  void* payload = node->data;

  free(payload);
  free(node);
}

/*
 * Allocate a robots_t with smalloc, fill it with zero bytes and
 * attach a fresh options hash whose node release function is
 * hnode_free.
 *
 * Returns the new object.
 */
static robots_t* params_alloc()
{
  robots_t* fresh = (robots_t*)smalloc(sizeof(robots_t));

  memset(fresh, 0, sizeof(*fresh));

  fresh->options = hash_create(33, 0, 0);
  hash_set_allocator(fresh->options, 0, hnode_free, 0);

  return fresh;
}
