/*
 *   Copyright (C) 1997, 1998
 *   	Free Software Foundation, Inc.
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#include <stdio.h>
#include <string.h>
#include <sys/types.h>

#include <salloc.h>
#include <logfile.h>

#include <getopttools.h>
#include <webbase.h>
#include <sqlutil.h>

/*
 * Capacity of the long_options table declared in init(); the same value
 * is also stored in the has_arg slot of that table's last initialised
 * entry.
 */
#define MAX_OPTIONS 100
/*
 * Value stored in the val slot of the last initialised long_options entry.
 * NOTE(review): presumably a marker understood by getopt_merge() --
 * confirm against getopttools.
 */
#define APPLICATION_OPTIONS		0x8000000

/*
 * Run-time settings of the consistency checker, filled in by init().
 */
typedef struct crawler_params {
  webbase_t* base;	/* handle returned by webbase_alloc(), released in finish() */
  int fake;		/* set by --fake: repair queries are printed but not executed */
  int repair;		/* set by --repair: enables the repair queries */
  char* log;		/* strdup'ed --log argument, handed to logfile() */
} crawler_params_t;

/* Settings shared by every function of this file. */
static crawler_params_t params;

/* Set by --verbose; when non-zero init() calls getopt_dump() on the option table. */
static int verbose = 0;

/* Parse the command line and allocate the webbase handle. */
static void init(int argc, char** argv);
/* Run the consistency checks (and the repairs when --repair is set). */
static void checker();
/* Release what init() allocated and terminate the process. */
static void finish();

/*
 * Entry point: parse the options, run the consistency checks, clean up.
 *
 * finish() terminates the process itself (it calls exit(0)), so the final
 * return statement only exists to keep the function well formed.
 */
int main(int argc, char** argv)
{
  init(argc, argv);

  /*
   * checker() is defined without parameters and reads the base from
   * params; passing it an argument would be undefined behaviour.
   */
  checker();
  finish();
  return 0;
}

/*
 * How the key column is inserted in a repair query by repair():
 * INDEX_INT values are used verbatim, any other type (INDEX_CHAR) goes
 * through sql_quote_char_simple() first.
 */
#define INDEX_INT  0x01
#define INDEX_CHAR 0x02

/*
 * Argument handed by checker_1() to the repair() row callback.
 */
typedef struct repair {
  int index_type;	/* INDEX_INT or INDEX_CHAR */
  int index;		/* column of the row that holds the key */
  char* repair_query;	/* printf-style format, the key replaces its %s */
} repair_t;

/*
 * Row callback given to sql_select() by checker_1(): build the repair
 * statement for one offending row, print it on stdout and, unless --fake
 * was given, execute it.
 *
 *   argp  pointer to the repair_t describing key column and query format
 *   res   result set the row belongs to (not used here)
 *   row   current row; row[arg->index] is the key inserted in the query
 *
 * A row whose key column is NULL is reported on stderr and skipped.
 */
static void repair(char* argp, MYSQL_RES* res, MYSQL_ROW row) {
  repair_t* arg = (repair_t*)argp;
  /* Scratch buffer kept from one call to the next, sized by static_alloc(). */
  static char* query = 0;
  static int query_size = 0;
  char* key = row[arg->index];
  size_t needed;
  int written;

  if(key == 0) {
    fprintf(stderr, "unexpected NULL for index (repair = %s)\n", arg->repair_query);
    /* No key to build a statement from; going on would dereference NULL. */
    return;
  }

  /*
   * Room for the format, the key (counted twice to leave space for
   * quoting) and the terminating NUL.
   */
  needed = strlen(arg->repair_query) + strlen(key) * 2 + 1;
  static_alloc(&query, &query_size, needed);

  written = snprintf(query, needed, arg->repair_query, (arg->index_type == INDEX_INT ? key : sql_quote_char_simple(key)));
  if(written < 0 || (size_t)written >= needed) {
    /* Never send a truncated statement to the server. */
    fprintf(stderr, "repair query too long, skipped (repair = %s)\n", arg->repair_query);
    return;
  }

  printf("%s\n", query);

  if(!params.fake)
    smysql_query(&params.base->mysql, query);
}

/*
 * Run one consistency check and, when --repair is set, repair its findings.
 *
 *   title         heading handed to sql_select() for the report pass
 *   query         select statement that lists the inconsistent rows
 *   index_type    INDEX_INT or INDEX_CHAR, how the key is inserted
 *   index         column of the selected rows that holds the key
 *   repair_query  printf-style statement run by repair() for each row
 *
 * The query is first run without a callback.  Only if that call returns
 * non-zero (presumably: rows were found -- confirm in sqlutil) and
 * repairs are enabled is it run a second time with repair() as callback.
 */
static void checker_1(char* title, char* query, int index_type, int index, char* repair_query)
{
  repair_t fix;

  /* Report pass. */
  if(!sql_select(&params.base->mysql, title, query, 0, 0))
    return;

  /* Without --repair the check is report-only. */
  if(!params.repair)
    return;

  /* Repair pass: same rows, each one handed to repair(). */
  fix.repair_query = repair_query;
  fix.index = index;
  fix.index_type = index_type;
  sql_select(&params.base->mysql, "", query, repair, (char*)&fix);
}

/*
 * Run every consistency check on the database of params.base.
 *
 * In order:
 *   - with --repair, delete the URLs that contain a space;
 *   - url.info flags against the matching url_complete fields;
 *   - cross references between url, start and start2url;
 *   - url_complete records no url refers to;
 *   - URLs flagged COMPLETE that have no url_complete record.
 *
 * Checks run through checker_1() are repaired when --repair is set; the
 * two run directly through sql_select() are report-only.
 */
static void checker()
{
  char query[10 * 1024];
  webbase_t* base = params.base;

  printf("================= consistency checks start for %s ==============\n", base->name);
  if(params.repair) {
    /* Distinct name so the query buffer above is not shadowed. */
    char* cleanup = "delete from url where url like '% %'";
    printf("%s\n", cleanup);
    if(!params.fake) smysql_query(&base->mysql, cleanup);
  }

  /*
   * url_complete flags in url
   *
   * NOTE(review): the titles below say "complete_url" while the table used
   * in the queries is url_complete; left untouched because the text is
   * program output.
   */

  /* Flag cleared in url.info although the url_complete field is set. */
#define S(flag,field) \
  do { \
    snprintf(query, sizeof(query), "select url.url,url.complete_rowid from url,url_complete where url.complete_rowid = url_complete.rowid and url.info & 0x%x = 0 and url_complete." #field " is not null", flag); \
    checker_1("\nURLs without " #flag " but complete_url." #field " not null\n", query, INDEX_CHAR, 0, "delete from url where url = '%s'"); \
  } while(0)
  S(WEBBASE_URL_INFO_BASE, base_url);
  S(WEBBASE_URL_INFO_RELATIVE, relative);
  S(WEBBASE_URL_INFO_ABSOLUTE, absolute);
  S(WEBBASE_URL_INFO_LOCATION, location);
#undef S

  /* Flag set in url.info although the url_complete field is null. */
#define S(flag,field) \
  do { \
    snprintf(query, sizeof(query), "select url.url,url.complete_rowid from url,url_complete where url.complete_rowid = url_complete.rowid and url.info & 0x%x != 0 and url_complete." #field " is null", flag); \
    checker_1("\nURLs with " #flag " but complete_url." #field " is null\n", query, INDEX_CHAR, 0, "delete from url where url = '%s'"); \
  } while(0)
  S(WEBBASE_URL_INFO_BASE, base_url);
  S(WEBBASE_URL_INFO_RELATIVE, relative);
  S(WEBBASE_URL_INFO_ABSOLUTE, absolute);
  S(WEBBASE_URL_INFO_LOCATION, location);
#undef S

  /* Any of the four flags set while complete_rowid is 0. */
  snprintf(query, sizeof(query), "select url,info from url where complete_rowid = 0 and info & 0x%x\n",
           WEBBASE_URL_INFO_BASE |
           WEBBASE_URL_INFO_RELATIVE |
           WEBBASE_URL_INFO_ABSOLUTE |
           WEBBASE_URL_INFO_LOCATION);
  checker_1("\nURLS which info field imply complete_rowid not null and it is null\n", query, INDEX_CHAR, 0, "delete from url where url = '%s'");

  /*
   * start2url
   */
  checker_1("\nURLs without a reference in start2url (table url):\n",
            "select start2url.url,url.url from url left join start2url on url.rowid = start2url.url where start2url.url is null",
            INDEX_CHAR,
            1,
            "delete from url where url = '%s'");

  checker_1("\nentries in start2url referencing non existent entry in url\n",
            "select url.rowid,start2url.start,start2url.url from start2url left join url on start2url.url = url.rowid where url.rowid is null",
            INDEX_INT,
            2,
            "delete from start2url where url = %s");

  checker_1("\nentries in start2url referencing non existent entry in start\n",
            "select start.rowid,start2url.start,start2url.url from start2url left join start on start2url.start = start.rowid where start.rowid is null",
            INDEX_INT,
            1,
            "delete from start2url where start = %s");

  /* Report only: no repair statement is attached to this check. */
  sql_select(&base->mysql,
             "\nentries in start2url with level < 0\n",
             "select start,url from start2url where level < 0", 0, 0);

  /*
   * url and url_complete
   */
  checker_1("\nZombie URLs additional information (table url_complete):\n",
            "select url.rowid,url_complete.rowid from url_complete left join url on url.complete_rowid = url_complete.rowid where url.rowid is null",
            INDEX_INT,
            1,
            "delete from url_complete where rowid = %s");

  /* Report only: no repair statement is attached to this check. */
  snprintf(query, sizeof(query), "select url_complete.rowid,url.url,url.complete_rowid from url left join url_complete on url.complete_rowid = url_complete.rowid where url.info & 0x%x != 0 and url_complete.rowid is null", WEBBASE_URL_INFO_COMPLETE);
  sql_select(&base->mysql,
             "\nURLs with COMPLETE flag but no record in complete\n",
             query, 0, 0);

  printf("================= consistency checks end for %s ==============\n", base->name);
}

/*
 * Release what init() allocated and terminate the process with status 0.
 * This function does not return.
 *
 * Defined static to agree with its forward declaration.
 */
static void finish()
{
  /* free() accepts a null pointer, so no guard is needed without --log. */
  free(params.log);
  params.log = 0;
  webbase_free(params.base);
  exit(0);
}

/*
 * Parse the command line and set up the global params.
 *
 *   argc, argv  the arguments received by main()
 *
 * Options handled here: --verbose, --fake, --repair (store 1 in their
 * variable), --log <file> and --verbose_sqlutil.  The table is extended
 * with the entries of webbase_options() and then handed to
 * webbase_alloc(), whose result becomes params.base.
 *
 * Exits with status 1 on an option parse error or when the --log argument
 * cannot be duplicated.
 */
static void init(int argc, char** argv)
{
  static struct option long_options[MAX_OPTIONS + 1] =
  {
    /* Entries with a non-null third field store 1 in that variable. */
    {"verbose", 0, &verbose, 1},
    {"fake", 0, &params.fake, 1},
    {"log", 1, 0, 0},
    {"repair", 0, &params.repair, 1},
    {"verbose_sqlutil", 0, 0, 0},
    {0, MAX_OPTIONS, 0, APPLICATION_OPTIONS}
  };

  getopt_merge(long_options, webbase_options(long_options));

  /* NOTE(review): 0 rather than 1, presumably to make getopt restart from scratch (glibc) -- confirm. */
  optind = 0;
  while(1) {
    /* `getopt_long' stores the option index here. */
    int option_index = 0;
    const char* name;
    int c;

    c = getopt_long_only(argc, argv, "", long_options, &option_index);

    /* Detect the end of the options. */
    if (c == -1)
      break;

    switch (c)
      {
      case 0:
        /* If this option set a flag, do nothing else now. */
        if (long_options[option_index].flag != 0)
          break;
        name = long_options[option_index].name;
        if(!strcmp(name, "log")) {
          /* The last --log wins; drop the copy made for an earlier one. */
          free(params.log);
          params.log = strdup(optarg);
          if(params.log == 0) {
            fprintf(stderr, "out of memory while reading option log\n");
            exit(1);
          }
        } else if(!strcmp(name, "verbose_sqlutil")) {
          sqlutil_verbose(1);
        }
        /* Any other option is not handled in this loop. */
        break;
      default:
        fprintf(stderr, "option parse error %c, 0x%x\n", c & 0xff, c);
        exit(1);
      }
  }

  params.base = webbase_alloc(argc, argv, long_options);

  if(params.log) logfile(params.log);

  if(verbose) getopt_dump(long_options);
}
