/*
 *   Copyright (C) 1997, 1998, 1999 Loic Dachary
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
#ifndef _webbase_url_h
#define _webbase_url_h

#if TIME_WITH_SYS_TIME
# include <sys/time.h>
# include <time.h>
#else
# if HAVE_SYS_TIME_H
#  include <sys/time.h>
# else
#  include <time.h>
# endif
#endif

#include <md5str.h>

/*
 * Maximum lengths of the fixed-size string members declared in this
 * header.  Every matching char array is declared with LENGTH + 1
 * elements (presumably to leave room for a terminating NUL -- confirm
 * against the implementation).
 */
#define WEBBASE_URL_LENGTH		512
#define WEBBASE_CONTENT_TYPE_LENGTH	32
#define WEBBASE_LANGUAGE_LENGTH		10
#define WEBBASE_EXTRACT_LENGTH		255
#define WEBBASE_TITLE_LENGTH		255
#define WEBBASE_KEYWORDS_LENGTH		255
#define WEBBASE_DESCRIPTION_LENGTH	255

/*
 * info field values
 *
 * Each constant occupies one distinct bit (0x00000001 through
 * 0x00800000), so several of them can be combined with bitwise OR and
 * tested with bitwise AND.
 * NOTE(review): presumably stored in the `info' member of
 * webbase_url_common_t -- confirm against the implementation.
 */
#define WEBBASE_URL_INFO_FRAME		0x00000001
#define WEBBASE_URL_INFO_COMPLETE	0x00000002
#define WEBBASE_URL_INFO_COOKIE		0x00000004
#define WEBBASE_URL_INFO_BASE		0x00000008
#define WEBBASE_URL_INFO_RELATIVE	0x00000010
#define WEBBASE_URL_INFO_ABSOLUTE	0x00000020
#define WEBBASE_URL_INFO_CONTENT	0x00000040
#define WEBBASE_URL_INFO_LOCATION	0x00000080
#define WEBBASE_URL_INFO_TIMEOUT	0x00000100
#define WEBBASE_URL_INFO_NOT_MODIFIED	0x00000200
#define WEBBASE_URL_INFO_NOT_FOUND	0x00000400
#define WEBBASE_URL_INFO_OK		0x00000800
#define WEBBASE_URL_INFO_ERROR		0x00001000
#define WEBBASE_URL_INFO_HTTP		0x00002000
#define WEBBASE_URL_INFO_FTP		0x00004000
#define WEBBASE_URL_INFO_NEWS		0x00008000
#define WEBBASE_URL_INFO_CUMULMD5	0x00010000
#define WEBBASE_URL_INFO_EXTRACT	0x00020000
#define WEBBASE_URL_INFO_TITLE		0x00040000
#define WEBBASE_URL_INFO_KEYWORDS	0x00080000
#define WEBBASE_URL_INFO_DESCRIPTION	0x00100000
#define WEBBASE_URL_INFO_READING	0x00200000
#define WEBBASE_URL_INFO_TRUNCATED	0x00400000
#define WEBBASE_URL_INFO_FTP_DIR	0x00800000

/*
 * Union of the LOCATION, TIMEOUT, NOT_MODIFIED, NOT_FOUND, OK and ERROR
 * bits above (0x00000080 through 0x00001000).
 */
#define WEBBASE_URL_INFO_CODE_MASK	0x00001f80

/*
 * Non-zero when `code' lies in the half-open range
 * [WEBBASE_URL_CODE_OK, WEBBASE_URL_CODE_MULTIPLE_CHOICES), zero
 * otherwise.
 *
 * The argument is parenthesized at each use so that an expression built
 * with a low-precedence operator (for instance `v & mask' or `a ? b : c')
 * is compared as a whole instead of being split by `>=' and `<'.
 * `code' is evaluated twice: do not pass an expression with side effects.
 */
#define WEBBASE_URL_INFO_LOAD_BODY(code) ((code) >= WEBBASE_URL_CODE_OK && \
					  (code) < WEBBASE_URL_CODE_MULTIPLE_CHOICES)

/*
 * code field values
 *
 * The values 100 through 505 carry the same numbers as the HTTP/1.1
 * status codes of the same name.  602, 603 and WEBBASE_URL_CODE_MAX
 * (610) lie outside the HTTP range.
 * NOTE(review): presumably these describe conditions detected locally
 * rather than reported by a server -- confirm against the implementation.
 */
#define WEBBASE_URL_CODE_CONTINUE			100
#define WEBBASE_URL_CODE_SWITCHING_PROTOCOLS		101

#define WEBBASE_URL_CODE_OK				200
#define WEBBASE_URL_CODE_CREATED			201
#define WEBBASE_URL_CODE_ACCEPTED			202
#define WEBBASE_URL_CODE_NON_AUTHORITATIVE_INFORMATION	203
#define WEBBASE_URL_CODE_NO_CONTENT			204
#define WEBBASE_URL_CODE_RESET_CONTENT			205
#define WEBBASE_URL_CODE_PARTIAL_CONTENT		206

#define WEBBASE_URL_CODE_MULTIPLE_CHOICES		300
#define WEBBASE_URL_CODE_MOVED_PERMANENTLY		301
#define WEBBASE_URL_CODE_MOVED_TEMPORARILY		302
#define WEBBASE_URL_CODE_SEE_OTHER			303
#define WEBBASE_URL_CODE_NOT_MODIFIED			304
#define WEBBASE_URL_CODE_USE_PROXY			305

#define WEBBASE_URL_CODE_BAD_REQUEST			400
#define WEBBASE_URL_CODE_UNAUTHORIZED			401
#define WEBBASE_URL_CODE_PAYMENT_REQUIRED		402
#define WEBBASE_URL_CODE_FORBIDDEN			403
#define WEBBASE_URL_CODE_NOT_FOUND			404
#define WEBBASE_URL_CODE_METHOD_NOT_ALLOWED		405
#define WEBBASE_URL_CODE_NOT_ACCEPTABLE			406
#define WEBBASE_URL_CODE_PROXY_AUTHENTICATION_REQUIRED	407
#define WEBBASE_URL_CODE_REQUEST_TIMEOUT		408
#define WEBBASE_URL_CODE_CONFLICT			409
#define WEBBASE_URL_CODE_GONE				410
#define WEBBASE_URL_CODE_LENGTH_REQUIRED		411
#define WEBBASE_URL_CODE_PRECONDITION_FAILED		412
#define WEBBASE_URL_CODE_REQUEST_ENTITY_TOO_LARGE	413
#define WEBBASE_URL_CODE_REQUEST_URI_TOO_LARGE		414
#define WEBBASE_URL_CODE_UNSUPPORTED_MEDIA_TYPE		415

#define WEBBASE_URL_CODE_INTERNAL_SERVER_ERROR		500
#define WEBBASE_URL_CODE_NOT_IMPLEMENTED		501
#define WEBBASE_URL_CODE_BAD_GATEWAY			502
#define WEBBASE_URL_CODE_SERVICE_UNAVAILABLE		503
#define WEBBASE_URL_CODE_GATEWAY_TIMEOUT		504
#define WEBBASE_URL_CODE_HTTP_VERSION_NOT_SUPPORTED	505

#define WEBBASE_URL_CODE_CONNECTION_REFUSED		602
#define WEBBASE_URL_CODE_CONNECTION_TIMED_OUT		603	

#define WEBBASE_URL_CODE_MAX				610	

/*
 * Mask and accessor for a code value.
 *
 * WEBBASE_URL_CODE(v) keeps the low 12 bits (0x0fff) of v and discards
 * every bit above them.  All WEBBASE_URL_CODE_* constants, up to
 * WEBBASE_URL_CODE_MAX (610), fit within those 12 bits.
 */
#define WEBBASE_URL_CODE_MASK	0x0fff

#define WEBBASE_URL_CODE(v)	((v) & WEBBASE_URL_CODE_MASK)

/*
 * Set / test one bit of the `tags' member of a struct webbase_url_common.
 *
 *   u    pointer to a struct webbase_url_common, or to a structure whose
 *        first member is one (webbase_url_complete_t).  A webbase_url_t
 *        pointer is NOT suitable: its first member is `url', so pass the
 *        address of its `data' member instead.
 *   tag  bit number (a shift count, not a mask).  `tags' is an unsigned
 *        short, so bit numbers at or above its width are lost on store.
 *
 * WEBBASE_TAG_SET evaluates to the updated `tags' value; WEBBASE_TAG_GET
 * evaluates to non-zero when the bit is set and to zero otherwise.
 *
 * The cast spells the type `struct webbase_url_common': in C the bare
 * struct tag is not a type name, so a cast to `webbase_url_common*' does
 * not compile.
 */
#define WEBBASE_TAG_SET(u, tag) (((struct webbase_url_common *)(u))->tags |= (1 << (tag)))
#define WEBBASE_TAG_GET(u, tag) (((struct webbase_url_common *)(u))->tags & (1 << (tag)))

/*
 * Fixed-size part of the description of a URL.  It is embedded as the
 * first member of webbase_url_complete_t, which is what allows
 * WEBBASE_TAG_SET / WEBBASE_TAG_GET to cast a pointer to the enclosing
 * structure to a pointer to this one.
 */
typedef struct webbase_url_common {
  int rowid;
  /* NOTE(review): presumably a bitwise OR of WEBBASE_URL_INFO_* -- confirm. */
  unsigned int info;
  /* NOTE(review): presumably one of the WEBBASE_URL_CODE_* values -- confirm. */
  unsigned short code;
  time_t mtime;
  time_t mtime_error;
  /* Bit set read and written through WEBBASE_TAG_GET / WEBBASE_TAG_SET. */
  unsigned short tags;
  char content_type[WEBBASE_CONTENT_TYPE_LENGTH + 1];
  unsigned int content_length;
  /* Exactly MD5_ASCII_SIZE bytes: no extra byte is added, unlike the
     string members of this structure. */
  unsigned char md5[MD5_ASCII_SIZE];
  int complete_rowid;
  time_t crawl;
  int hookid;
  char language[WEBBASE_LANGUAGE_LENGTH + 1];
  char extract[WEBBASE_EXTRACT_LENGTH + 1];
  char title[WEBBASE_TITLE_LENGTH + 1];
} webbase_url_common_t;

/*
 * Default values
 *
 * Each name ends with the upper-case spelling of a member of
 * webbase_url_start_t (url_max_size, size_limit, min, depth, level,
 * timeout and the five *_delay members).
 * NOTE(review): presumably the value given to that member when none is
 * specified; the units (bytes, seconds, days, ...) are not visible in
 * this header -- confirm against the implementation.
 */
#define WEBBASE_URL_START_DEFAULT_URL_MAX_SIZE		254
#define WEBBASE_URL_START_DEFAULT_SIZE_LIMIT      	100000
#define WEBBASE_URL_START_DEFAULT_MIN      		0
#define WEBBASE_URL_START_DEFAULT_DEPTH      		2
#define WEBBASE_URL_START_DEFAULT_LEVEL      		100000
#define WEBBASE_URL_START_DEFAULT_TIMEOUT      		60
#define WEBBASE_URL_START_DEFAULT_LOADED_DELAY     	7
#define WEBBASE_URL_START_DEFAULT_MODIFIED_DELAY     	14
#define WEBBASE_URL_START_DEFAULT_NOT_FOUND_DELAY     	30
#define WEBBASE_URL_START_DEFAULT_TIMEOUT_DELAY     	1
#define WEBBASE_URL_START_DEFAULT_ROBOT_DELAY     	60

/*
 * Info field values
 *
 * Each constant occupies one distinct bit (0x00001 through 0x10000), so
 * several of them can be combined with bitwise OR.
 * NOTE(review): presumably stored in the `info' member of
 * webbase_url_start_t -- confirm against the implementation.
 */
#define WEBBASE_URL_START_SLEEPY	0x00001
#define WEBBASE_URL_START_UNESCAPE	0x00002
#define WEBBASE_URL_START_AUTH		0x00004
#define WEBBASE_URL_START_NOCOOKIE	0x00008
#define WEBBASE_URL_START_ACCEPT	0x00010
#define WEBBASE_URL_START_HEURISTICS	0x00020
#define WEBBASE_URL_START_STICKY	0x00040
#define WEBBASE_URL_START_FILTER	0x00080
#define WEBBASE_URL_START_HOMEFREE	0x00100
#define WEBBASE_URL_START_VIRGIN	0x00200
#define WEBBASE_URL_START_EXPLORING	0x00400
#define WEBBASE_URL_START_EXPLORED	0x00800
#define WEBBASE_URL_START_UPDATING	0x01000
#define WEBBASE_URL_START_IN_CORE	0x02000
#define WEBBASE_URL_START_ALLOW		0x04000
#define WEBBASE_URL_START_DISALLOW	0x08000
#define WEBBASE_URL_START_HOOK_INFO	0x10000

/*
 * Union of the VIRGIN, EXPLORING, EXPLORED and UPDATING bits above
 * (0x00200 through 0x01000).
 */
#define WEBBASE_URL_START_STATE_MASK    0x1e00

/*
 * Durations built from one another: an hour is 60 * 60 units, a day is
 * 24 hours and a week is 7 days.
 * NOTE(review): 60 * 60 implies the unit is the second -- confirm.
 */
#define WEBBASE_TIME_HOUR	(60 * 60)
#define WEBBASE_TIME_DAY	(WEBBASE_TIME_HOUR * 24)
#define WEBBASE_TIME_WEEK	(WEBBASE_TIME_DAY * 7)

/*
 * Symbolic names of document parts
 *
 * The values are consecutive numbers (1 to 4), not single-bit masks:
 * 0x03 has two bits set.
 * NOTE(review): presumably meant to be passed as the `tag' (bit number)
 * argument of WEBBASE_TAG_SET / WEBBASE_TAG_GET -- confirm.
 */
#define WEBBASE_URL_TAG_BODY		0x01
#define WEBBASE_URL_TAG_TITLE		0x02
#define WEBBASE_URL_TAG_KEY		0x03
#define WEBBASE_URL_TAG_DESCRIPTION	0x04

/*
 * Description of a starting point.  The URL is stored inline; the
 * variable-size settings are each held as a char pointer paired with an
 * int length member.
 * NOTE(review): who allocates and frees the pointed-to buffers is not
 * visible in this header -- confirm against the implementation.
 */
typedef struct webbase_url_start {
  char url[WEBBASE_URL_LENGTH + 1];
  /* Exactly MD5_ASCII_SIZE bytes: no extra byte is added, unlike url[]. */
  unsigned char url_md5[MD5_ASCII_SIZE];
  /* NOTE(review): presumably a bitwise OR of WEBBASE_URL_START_* -- confirm. */
  int info;
  /*
   * The members from url_max_size to robot_delay each have a matching
   * WEBBASE_URL_START_DEFAULT_* constant.  The five *_delay members are
   * plain char, so they can only hold small values.
   */
  int url_max_size;
  int size_limit;
  int min;
  int depth;
  int level;
  short timeout;
  char loaded_delay;
  char modified_delay;
  char not_found_delay;
  char timeout_delay;
  char robot_delay;
  /* Pointer / length pairs. */
  char* auth;
  int auth_length;
  char* accept;
  int accept_length;
  char* filter;
  int filter_length;
  char* allow;
  int allow_length;
  char* disallow;
  int disallow_length;
  char* hook_info;
  int hook_info_length;

  int count;
  int rowid;
  int delay;
} webbase_url_start_t;

/*
 * A webbase_url_common_t extended with two more fixed-size strings and
 * four variable-size ones, each of the latter held as an int length
 * member paired with a char pointer.
 * NOTE(review): who allocates and frees the pointed-to buffers is not
 * visible in this header -- confirm against the implementation.
 */
typedef struct webbase_url_complete {
  /* Must stay the first member: WEBBASE_TAG_SET / WEBBASE_TAG_GET cast a
     pointer to the enclosing structure to a pointer to this type. */
  webbase_url_common_t common;
  char keywords[WEBBASE_KEYWORDS_LENGTH + 1];
  char description[WEBBASE_DESCRIPTION_LENGTH + 1];
  /* Length / pointer pairs. */
  int base_url_length;
  char* base_url;
  int relative_length;
  char* relative;
  int absolute_length;
  char* absolute;
  int location_length;
  char* location;
} webbase_url_complete_t;

/*
 * Shorthand member names for use on a webbase_url_t: each w_NAME macro
 * expands to the path of the corresponding member, e.g. `u->w_info'
 * becomes `u->data.common.info' and `u->w_keywords' becomes
 * `u->data.keywords'.
 */
#define w_url			url
#define w_url_length		url_length
#define w_url_md5		url_md5
#define w_rowid			data.common.rowid
#define w_info			data.common.info
#define w_code			data.common.code
#define w_mtime			data.common.mtime
#define w_mtime_error		data.common.mtime_error
#define w_tags			data.common.tags
#define w_content_type		data.common.content_type
#define w_content_length	data.common.content_length
/*
 * NOTE(review): webbase_url_common_t declares no `document_size' member
 * in this header, so any use of w_document_size fails to compile.  The
 * macro looks stale (or the member is missing) -- confirm and clean up.
 */
#define w_document_size		data.common.document_size
#define w_md5			data.common.md5
#define w_complete_rowid	data.common.complete_rowid
#define w_crawl			data.common.crawl
#define w_hookid		data.common.hookid
#define w_language		data.common.language
#define w_extract		data.common.extract
#define w_title			data.common.title
#define w_keywords		data.keywords
#define w_description		data.description
#define w_base_url_length	data.base_url_length
#define w_base_url		data.base_url
#define w_relative_length	data.relative_length
#define w_relative		data.relative
#define w_absolute_length	data.absolute_length
#define w_absolute		data.absolute
#define w_location_length	data.location_length
#define w_location		data.location

/*
 * Top-level URL object: the URL itself (pointer plus length), its MD5,
 * a path and the embedded webbase_url_complete_t record.  The w_*
 * macros give short names for these members.
 *
 * Because `data' is not the first member, a webbase_url_t pointer must
 * not be handed directly to WEBBASE_TAG_SET / WEBBASE_TAG_GET; pass the
 * address of `data' instead.
 */
typedef struct webbase_url {
  char* url;
  int url_length;
  /* Exactly MD5_ASCII_SIZE bytes: no extra byte is added. */
  unsigned char url_md5[MD5_ASCII_SIZE];
  /* NOTE(review): presumably a file-system path related to the URL --
     confirm against the implementation. */
  char* path;
  webbase_url_complete_t data; 
} webbase_url_t;

/*
 * Life cycle of webbase_url_t objects.  Only the declarations are
 * visible in this header; the notes below are to be confirmed against
 * the implementation.
 *
 * webbase_url_object  takes no argument and returns a webbase_url_t
 *                     pointer (presumably a newly allocated object).
 *                     Declared with (void) so that the declaration is a
 *                     real prototype and a call passing arguments is
 *                     diagnosed by the compiler.
 * webbase_url_free    presumably releases an object obtained from
 *                     webbase_url_object().
 * webbase_copy_url    presumably copies the content of `from' into `to'.
 */
webbase_url_t* webbase_url_object(void);
void webbase_url_free(webbase_url_t* webbase_url);
void webbase_copy_url(webbase_url_t* to, webbase_url_t* from);

/*
 * Values for the `flag' argument of webbase_url_reset().
 * NOTE(review): presumably PARTIAL resets only part of the object and
 * TOTAL resets all of it; which members each one touches is not visible
 * in this header -- confirm against the implementation.
 */
#define WEBBASE_URL_RESET_PARTIAL	1
#define WEBBASE_URL_RESET_TOTAL		0
void webbase_url_reset(webbase_url_t* webbase_url, int flag);

/*
 * Walking the links of a URL.
 *
 * The WEBBASE_URL_WALK_* constants are distinct bits (0x01, 0x02, 0x04)
 * and can therefore be combined with bitwise OR in the `flag' argument.
 *
 * webbase_url_walk_href_callback_t is the type of the function given as
 * `func'; it receives an opaque `args' pointer, a base webbase_url_t, a
 * URL string and a flag, and returns an int.
 * NOTE(review): presumably `args' is the value handed to
 * webbase_url_walk_href() passed through unchanged, and the callback is
 * invoked once per link; the meaning of both return values is not
 * visible in this header -- confirm against the implementation.
 */
#define WEBBASE_URL_WALK_ABSOLUTE	0x01
#define WEBBASE_URL_WALK_RELATIVE	0x02
#define WEBBASE_URL_WALK_ROBOTS		0x04
typedef int (*webbase_url_walk_href_callback_t)(char* args, webbase_url_t* base_url, char* url, int flag);
int webbase_url_walk_href(webbase_url_t* webbase_url, int flag, webbase_url_walk_href_callback_t func, char* args);

/*
 * Predicates returning an int.
 * NOTE(review): presumably webbase_url_start_ok() tells whether the
 * `url_length' bytes at `url' are acceptable as a starting point, and
 * webbase_url_robots_p() whether the object designates a robots.txt
 * resource; the exact return values are not visible in this header --
 * confirm against the implementation.
 */
int webbase_url_start_ok(char* url, int url_length);
int webbase_url_robots_p(webbase_url_t* webbase_url);

/*
 * Mutators and helpers operating on a webbase_url_t.  None of them
 * returns a value, so failures cannot be reported through the return
 * channel.
 * NOTE(review): presumably each *_set function stores its second
 * argument in the member named in the function (code, content_type,
 * mtime, location, content_length) and the *_fix functions adjust a
 * member already stored; this is inferred from the names only -- confirm
 * against the implementation.
 *
 * webbase_url_mtime_set() takes the time as a string although the
 * `mtime' member is a time_t, and webbase_url_content_length_set()
 * takes an int although the `content_length' member is an unsigned int.
 */
void webbase_url_code_set(webbase_url_t* webbase_url, int code);
void webbase_url_content_type_set(webbase_url_t* webbase_url, char* content_type);
void webbase_url_mtime_set(webbase_url_t* webbase_url, char* mtime);
void webbase_url_location_set(webbase_url_t* webbase_url, char* location);
void webbase_url_content_length_fix(webbase_url_t* webbase_url, char* path);
void webbase_url_content_length_set(webbase_url_t* webbase_url, int content_length);
void webbase_url_hrefs_set(webbase_url_t* webbase_url, char* path, int size_limit);
void webbase_url_print(webbase_url_t* webbase_url);
void webbase_url_mtime_fix(webbase_url_t* webbase_url);

#endif /* _webbase_url_h */
