#!/bin/sh
#
#   Copyright (C) 1997, 1998
#   	Free Software Foundation, Inc.
#
#   This program is free software; you can redistribute it and/or modify it
#   under the terms of the GNU General Public License as published by the
#   Free Software Foundation; either version 2, or (at your option) any
#   later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
#

. config
. params
. test_functions

# Crawl an FTP tree, accepting text/application/video MIME types only,
# then run the consistency checker and dump url/content_type for
# comparison against the reference output.
# $crawler, $consistentc, $mysql, $testdb, $ftphost and init_sql are
# defined by the sourced config/params/test_functions files.
ftp() {
init_sql
$crawler -accept 'text/*,application/*,video/*' -depth 100 -verbose_crawl -no_hook -robot_delay 0 -base $testdb -disallow 'CVS' $ftphost/
$consistentc -base $testdb

$mysql -e 'select url,content_type from url' $testdb
}

# MIME-type filtering: crawl the pdf/ tree with no -accept filter, then
# the msword/ tree accepting only text/* and application/msword, and
# finally verify base consistency.
mime() {
init_sql

$crawler -depth 100 -verbose_crawl -no_hook -robot_delay 0 -base $testdb -disallow 'CVS' $ftphost/pdf/
$crawler -depth 100 -accept 'text/*,application/msword' -verbose_crawl -no_hook -robot_delay 0 -base $testdb -disallow 'CVS' $ftphost/msword/
$consistentc -base $testdb
}

# Compare a default crawl of the local test web with a second crawl of
# the same web run with -noheuristics; the two runs' output is
# separated by a blank line in the reference output.
noheuristics() {
init_sql
$crawler -no_hook -robot_delay 0 -base $testdb http://localhost:7400/
echo
$crawler -verbose_crawl -noheuristics -no_hook -robot_delay 0 -base $testdb http://localhost:7400/
}

#
# If-Modified-Since handling: crawl 0201.html with a known mtime, force
# its recorded crawl time back to the epoch and recrawl (page unchanged,
# presumably expecting a not-modified result), then bump the mtime one
# day and recrawl again (expecting a real reload).  The original file
# timestamp is preserved through a backup copy.
#
modified() {
init_sql
cp -p $testdir/htdocs/0201.html $testdir/htdocs/0201.html.back
# Set mtime to 20 Mar 1998 20:20.  The original used the obsolescent
# "touch MMDDhhmmYY file" form (touch 0320202098 ...), which modern
# touch implementations parse as a FILENAME — creating a junk file and
# leaving the mtime untouched.  Use the portable -t form instead.
touch -t 199803202020 $testdir/htdocs/0201.html
$crawler -no_hook -robot_delay 0 -base $testdb http://localhost:7400/0201.html
# Reset the stored crawl time so the crawler re-checks the page even
# though it was just crawled.
$mysql -e "update url set crawl = from_unixtime(0) where url = 'http://localhost:7400/0201.html'" $testdb
$crawler -verbose_crawl -no_hook -robot_delay 0 -base $testdb http://localhost:7400/0201.html
# Advance mtime one day (21 Mar 1998 20:20) so the next recrawl sees a
# modified page.
touch -t 199803212020 $testdir/htdocs/0201.html
$mysql -e "update url set crawl = from_unixtime(0) where url = 'http://localhost:7400/0201.html'" $testdb
$crawler -verbose_crawl -no_hook -robot_delay 0 -base $testdb http://localhost:7400/0201.html
rm $testdir/htdocs/0201.html
mv $testdir/htdocs/0201.html.back $testdir/htdocs/0201.html
}

# Robot-delay handling: no delay, a 3 second delay, two start points on
# the same web that must alternate, and two identical webs that must
# load in parallel.
robots() {
init_sql
#
# Inhibit robot delay handling by setting robot_delay to 0
#
$crawler -no_hook -depth 5 -robot_delay 0 -base $testdb http://localhost:7400/
echo
#
# Common robot delay handling
#
$crawler -no_hook -depth 5 -robot_delay 3 -base $testdb http://localhost:7400/redir.html
echo
#
# Simultaneous load of two parts of the same web. They
# should alternate.
#
init_sql
$mysql -e "insert into start (url,depth,robot_delay) values ('http://localhost:7400/', 4, 5)" $testdb
$mysql -e "insert into start (url,depth,robot_delay) values ('http://localhost:7400/redir.html', 4, 5)" $testdb
$crawler -no_hook -home_pages -base $testdb
echo
#
# Simultaneous load of two identical webs, they must load
# exactly in parallel
#
init_sql
$mysql -e "insert into start (url,depth,robot_delay) values ('http://localhost:7400/', 4, 5)" $testdb
$mysql -e "insert into start (url,depth,robot_delay) values ('http://localhost:7401/', 4, 5)" $testdb
$crawler -no_hook -home_pages -base $testdb
echo
}

# Crawl a CGI that delivers its response slowly; verify the page is
# still stored (url/code/extract dump) and the base stays consistent.
slow() {
init_sql
$crawler -robot_delay 0 -no_hook -base $testdb http://localhost:7400/nph-slow.cgi
$mysql -e 'select url,code,extract from url' $testdb
$consistentc -base $testdb
}

# Network error paths: an unresolvable host, a refused connection
# (port 900), a hanging CGI with an explicit 5s -timeout, and the same
# CGI with the default timeout.  Dump codes and check consistency.
timeouts() {
init_sql
$crawler -robot_delay 0 -no_hook -base $testdb http://unlikely/
$crawler -robot_delay 0 -no_hook -base $testdb http://localhost:900/
$crawler -robot_delay 0 -no_hook -timeout 5 -base $testdb http://localhost:7400/nph-hang.cgi?foo=bar
$crawler -robot_delay 0 -no_hook -base $testdb http://localhost:7400/nph-hang.cgi
$mysql -e 'select url,code,extract from url' $testdb
$consistentc -base $testdb
}

# Load hand-crafted (deliberately inconsistent) data into the base and
# verify that the consistency checker reports the problems.
consistent() {
#
# Testing consistency checks
#
init_sql
$mysql $testdb < $testdir/consistent.data
$consistentc -base $testdb
}

# Frameset handling: crawl a page whose frameset has one resolvable and
# one unresolvable frame pointer, then dump the frame page's info field.
frame() {
init_sql
#
# URL containing a frameset with 2 pointers, one found one not found
#
$crawler -robot_delay 0 -no_hook -base $testdb -depth 10 http://localhost:7400/frame.html
$mysql -e "select url,info from url where url like '%frame.html'" $testdb
}

# The crawler must survive a page containing corrupted href links and a
# link that is too long.
corrupted_links() {
init_sql
#
# URL containing corrupted href links, and link too long
#
$crawler -robot_delay 0 -no_hook -base $testdb -depth 20 http://localhost:7400/corrupted.html
}

# A start point flagged disabled='yes' must be skipped when loading
# home pages.
disabled() {
init_sql
$mysql -e "insert into start (url,robot_delay) values ('http://localhost:7400/', 0)" $testdb
$mysql -e "insert into start (url,disabled) values ('http://localhost:7400/0202.html', 'yes')" $testdb
$crawler -base $testdb -home_pages -no_hook
}

# HTML comment parsing: a page with a standard comment (its extract is
# dumped), then a page with an unterminated comment at the beginning.
comment() {
init_sql
#
# URL containing a standard comment
#
$crawler -robot_delay 0 -no_hook -base $testdb http://localhost:7400/comment.html
$mysql -e "select extract from url where url like '%comment%'" $testdb
echo
#
# URL containing an unterminated comment at the beginning
#
$crawler -robot_delay 0 -no_hook -base $testdb http://localhost:7400/comment1.html
$consistentc -base $testdb
}

# Insert three start points and crawl only the second and third via the
# -where_start SQL restriction on the start table.
where_start() {
init_sql
$mysql -e "insert into start (url,depth,robot_delay) values ('http://localhost:7400/', 3, 5)" $testdb
$mysql -e "insert into start (url,depth,robot_delay) values ('http://localhost:7400/0202.html', 3, 5)" $testdb
$mysql -e "insert into start (url,depth,robot_delay) values ('http://localhost:7400/0201.html', 3, 5)" $testdb
$crawler -base $testdb -no_hook -home_pages -where_start 'rowid in (2,3)'
$consistentc -base $testdb
}

# Grab-bag of manual checks: a page with many hrefs, the -size_limit
# restriction, shrinking a crawl with -update -depth, the -touch flag
# (reload only when the .store file is missing), and a >50k page whose
# stored content is checksummed.  $WLROOT is the on-disk store root
# (defined in the sourced config — presumably; verify there).
manual() {
init_sql
echo "#"
echo "# Many hrefs in a single page"
echo "#"
$crawler -robot_delay 0 -no_hook -base $testdb -depth 10 http://localhost:7400/manyrefs.html
$mysql -e "select length(relative) from url_complete" $testdb
echo
init_sql
echo "#"
echo "# Restrict number of hrefs loaded "
echo "#"
$crawler -robot_delay 0 -no_hook -size_limit 10000 -base $testdb -depth 10 http://localhost:7400/manyrefs.html
$mysql -e "select length(relative) from url_complete" $testdb
echo
init_sql
echo "#"
echo "# Simple load of 10 urls"
echo "#"
$crawler -robot_delay 0 -no_hook -base $testdb -verbose_crawl -depth 10 http://localhost:7400/
echo
echo "#"
echo "# Reduce to 4 urls"
echo "#"
$crawler -robot_delay 0 -update -no_hook -base $testdb -verbose_crawl -depth 4 http://localhost:7400/
echo
echo "#"
echo "# Check that url is reloaded with the 'touch' flag, only if the file does"
echo "# not exists."
echo "#"
rm $WLROOT/http:/localhost:7400/0202.html.store
$crawler -robot_delay 0 -no_hook -base $testdb -verbose_crawl -touch http://localhost:7400/0202.html
$crawler -robot_delay 0 -no_hook -base $testdb -verbose_crawl -touch http://localhost:7400/0202.html
echo
echo "#"
echo "# Load an url whose content is big (>50k)"
echo "#"
$crawler -robot_delay 0 -no_hook -base $testdb -verbose_crawl http://localhost:7400/big.html
cksum < $WLROOT/http:/localhost:7400/big.html.store
echo
$consistentc -base $testdb
}

# Update semantics on an existing crawl: initial load from the start
# table, then change only the disallow pattern, then change only the
# depth, re-running with -update each time.
manual2() {
init_sql
echo "#"
echo "# Load from start"
echo "#"
$mysql -e "insert into start (url,depth,robot_delay) values ('http://localhost:7400/', 5, 0)" $testdb
$crawler -no_hook -home_pages -base $testdb -verbose_crawl 
echo "#"
echo "# Change disallow "
echo "#"
$crawler -no_hook -disallow '/0202.html' -update -base $testdb -verbose_crawl http://localhost:7400/
echo "#"
echo "# Change depth only "
echo "#"
$crawler -no_hook -update -depth 10 -base $testdb -verbose_crawl http://localhost:7400/
}

# HTTP redirections: a chain of standard redirections (locks table must
# end up empty and the base consistent), then a Location: header whose
# relative URL points back to itself.
redirections() {
init_sql

echo "#"
echo "# Standard redirections"
echo "#"
$mysql -e "insert into start (url,depth,robot_delay) values ('http://localhost:7400/redir.html', 100, 0)" $testdb
$crawler -no_hook -home_pages -base $testdb -verbose_crawl # -verbose_webbase
$mysql -e "select * from locks" $testdb
$consistentc -base $testdb

init_sql
echo "#"
echo "# Location: field contains a relative URL pointing to itself"
echo "#"
$crawler -robot_delay 0 -depth 5 -no_hook -base $testdb -verbose_crawl http://localhost:7400/nph-location.cgi
$mysql -e "select * from locks" $testdb
}

# -unload removes a start point and the URLs reachable only from it;
# first with two overlapping start points (unload run twice to check
# idempotence), then -unload_keep_start which must keep the start row.
unload() {
init_sql
echo "#"
echo "# Create two start points that overlap"
echo "#"
$crawler -depth 5 -robot_delay 0 -no_hook -base $testdb http://localhost:7400/
$crawler -depth 5 -robot_delay 0 -no_hook -base $testdb http://localhost:7400/0202.html
$crawler -depth 5 -update -robot_delay 0 -no_hook -base $testdb http://localhost:7400/0202.html
$crawler -unload -no_hook -base $testdb http://localhost:7400/
$crawler -unload -no_hook -base $testdb http://localhost:7400/
$consistentc -base $testdb
$mysql -e "select rowid,url from url" $testdb
$mysql -e "select start,url from start2url" $testdb
echo "#"
echo "# Unload all except start"
echo "#"
init_sql
$crawler -depth 5 -robot_delay 0 -no_hook -base $testdb http://localhost:7400/
$crawler -unload_keep_start -no_hook -base $testdb http://localhost:7400/
$mysql -e "select rowid,url from url" $testdb
$mysql -e "select start,url from start2url" $testdb
$mysql -e "select url from start" $testdb
}

# Assorted crawl-logic corner cases: forced update after an interrupted
# crawl, URL fragments (#hash) being ignored, multiple home pages on
# different webs, and update when the first URL row has disappeared.
crawl_logic2() {
echo "#"
echo "# Force update even if crawl interrupted"
echo "#"
init_sql
$crawler -depth 5 -robot_delay 0 -no_hook -base $testdb http://localhost:7400/
$mysql -e "update start set info = 'exploring'" $testdb
$crawler -depth 3 -robot_delay 0 -update -no_hook -base $testdb http://localhost:7400/
$consistentc -base $testdb
$mysql -e "select count(*) from url" $testdb
echo "#"
echo "# Hash at end of URL is not taken in account"
echo "#"
init_sql
$crawler -depth 50 -robot_delay 0 -no_hook -base $testdb http://localhost:7400/hash.html
$consistentc -base $testdb
echo "#"
echo "# Home pages loading when there is more than one home page, not on"
echo "# the same web."
echo "#"
init_sql
$mysql -e "insert into start (url,depth,robot_delay) values ('http://localhost:7400/', 5, 0)" $testdb
$mysql -e "insert into start (url,depth,robot_delay) values ('http://localhost:7401/', 5, 0)" $testdb
$crawler -robot_delay 0 -no_hook -home_pages -base $testdb
echo "#"
echo "# Update must handle gracefully the case where the first URL has"
echo "# disapeared"
echo "#"
init_sql
$crawler -depth 5 -robot_delay 0 -no_hook -base $testdb http://localhost:7400/
$mysql -e "delete from url where url = 'http://localhost:7400/'" $testdb
$crawler -no_hook -base $testdb -update http://localhost:7400/
$mysql -e "select url from url order by url" $testdb
$consistentc -base $testdb
# NOTE(review): the disabled variant below uses bare "mysql" instead of
# "$mysql" — fix if it is ever re-enabled.
#echo "#"
#echo "# Update must handle gracefully the case where an URL has disapeared"
#echo "#"
#init_sql
#$crawler -depth 5 -robot_delay 0 -no_hook -base $testdb http://localhost:7400/
#mysql -e "delete from url where url = 'http://localhost:7400/0202.html'" $testdb
#$crawler -no_hook -base $testdb -update http://localhost:7400/
#mysql -e "select url from url order by url" $testdb
#$consistentc -base $testdb
}

# Update of two webs at once: crawl both, raise robot_delay, reset the
# crawl time of selected URLs, then re-run with -home_pages -update.
crawl_update() {
init_sql
echo "#"
echo "# Simple crawl for a start"
echo "#"
$crawler -no_hook -base $testdb -depth 4 -robot_delay 0 http://localhost:7400/
$crawler -no_hook -base $testdb -depth 4 -robot_delay 0 http://localhost:7401/
$mysql -e "update start set robot_delay = 3" $testdb
$mysql -e "update url set crawl = 0 where url = 'http://localhost:7400/' or url = 'http://localhost:7400/0201.html'" $testdb
$mysql -e "update url set crawl = 0 where url = 'http://localhost:7401/' or url = 'http://localhost:7401/0201.html'" $testdb
$crawler -no_hook -base $testdb -home_pages -update
}

# Core crawl logic: initial crawl from the start table, recrawl of a
# single URL via -urls, depth increase with a pretend-interrupted crawl
# (time-dependent "Since:" output is normalized by sed), and update
# after a referenced page is temporarily removed from the filesystem.
crawl_logic() {
init_sql
echo "#"
echo "# Simple crawl for a start"
echo "#"
$mysql -e "insert into start (url,depth,robot_delay) values ('http://localhost:7400/', 3, 0)" $testdb
$crawler -no_hook -home_pages -base $testdb -verbose_crawl # -verbose_webbase
$consistentc -base $testdb
$mysql -e "select rowid,url from url order by rowid" $testdb

echo "#"
echo "# Recrawl a specific url only"
echo "#"
rm $WLROOT/http:/localhost:7400/.store
$crawler -no_hook -urls "url = 'http://localhost:7400/'"  -base $testdb -verbose_crawl # -verbose_webbase
$consistentc -base $testdb
( cd $WLROOT/http:/localhost:7400/ ; ls .store )

echo "#"
echo "# Increase depth and pretend that crawl was interrupted"
echo "#"
$mysql -e "update start set depth = 12, info = 'exploring'" $testdb
#
# New URLs will be found in start point because we pretend that it needs
# crawling. The other two URLs will be left untouched.
#
$mysql -e "update url set crawl = from_unixtime(0) where url = 'http://localhost:7400/'" $testdb
$mysql -e "update url set crawl = from_unixtime(0) where url = 'http://localhost:7400/0201.html'" $testdb
$crawler -no_hook -home_pages -base $testdb -verbose_crawl | sed -e 's/Since:.*/Since: SOME TIME/'
$consistentc -base $testdb
$mysql -e "select rowid,url from url order by rowid" $testdb

echo "#"
echo "# Remove URL containing references and update crawl"
echo "#"
mv $testdir/htdocs/0201.html $testdir/htdocs/0201.html.1
$mysql -e "update url set crawl = from_unixtime(0) where url = 'http://localhost:7400/0201.html'" $testdb
$crawler -no_hook -home_pages -base $testdb -verbose_crawl
mv $testdir/htdocs/0201.html.1 $testdir/htdocs/0201.html

$consistentc -base $testdb
$mysql -e "select rowid,url from url order by rowid" $testdb
}

# Exercise the $dumpdata tool (same name as this function): crawl a web
# then dump raw.* files with two different -size_max values, checksums
# are compared against the reference output.
dumpdata() {
init_sql
rm -f raw.*
$crawler -depth 200 -robot_delay 0 -no_hook -base $testdb http://localhost:7401/
$dumpdata -base $testdb -size_max 12
echo
cksum raw.*
$dumpdata -base $testdb -size_max 1
echo
cksum raw.*
rm raw.*
}

# Compare the $html2text dump (working from the on-disk store) with the
# $dumpdata dump (working from the base): both raw.* checksums must
# match the reference output.
t_html2text() {
echo "#"
echo "# dump data from html files"
echo "#"
init_sql
rm -f raw.*
rm -fr $WLROOT/*
$crawler -depth 200 -robot_delay 0 -no_hook -base $testdb http://localhost:7401/
$html2text -size_max 12 -dir $WLROOT 
echo
cksum raw.*
$dumpdata -base $testdb -size_max 12
echo
cksum raw.*
rm raw.*
}

# External-link handling: crawl two webs that reference each other,
# then list externals from two complementary -where_start selections.
external() {
init_sql
$crawler -robot_delay 0 -no_hook -base $testdb http://localhost:7401/externals/0202.html
$crawler -robot_delay 0 -depth 30 -no_hook -base $testdb http://localhost:7400/externals/
echo
$crawler -externals -base $testdb -where_start "url not like '%externals%'"
echo
$crawler -externals -base $testdb -where_start "url like '%externals%'"
}

# Cookie handling scenarios.  The whole body runs in a subshell whose
# combined output is filtered by sed to mask the volatile Apache
# session cookie value.
# NOTE(review): in the trailing filter 's/Apache=.*;*/Apache=.../g' the
# ";*" is redundant (".*" is greedy) — presumably "[^;]*;" was meant;
# left untouched since the filter shapes the reference output.
cookies() {
(
echo "#"
echo "# Load a cookie'd file and ignore cookie"
echo "#"
init_sql
$mysql -e "insert into start (url,depth,robot_delay,info) values ('http://localhost:7401/cookies/', 3, 0, 'nocookie')" $testdb
$crawler -verbose_cookies -no_hook -base $testdb http://localhost:7401/cookies/ 
$mysql -e "select info from start" $testdb
echo "#"
echo "# Cookie + redirection. Check with infinit.net since I don't know"
echo "# how to reproduce this case."
echo "#"
init_sql
$crawler -verbose_crawl -verbose_cookies -robot_delay 0 -depth 3 -no_hook -base $testdb http://www.infinit.net/ 2>&1 | sed -e 's/OpenMarketSI=[^;]*;/OpenMarketSI=TAG/g'
echo
echo "#"
echo "# Load a cookie "
echo "#"
init_sql
$crawler -verbose_cookies -robot_delay 0 -depth 3 -no_hook -base $testdb http://localhost:7401/cookies/ 
echo "#"
echo "# Artificialy change the cookie so that it is updated"
echo "#"
$mysql -e "update cookies set cookie_out = 'foobar'" $testdb
echo
$crawler -noheuristics -verbose_cookies -robot_delay 0 -depth 3 -no_hook -base $testdb http://localhost:7401/cookies/ 
echo
init_sql
echo "#"
echo "# Alternate cookie and non cookie"
echo "#"
$mysql -e "insert into start (url,depth,robot_delay) values ('http://localhost:7400/', 4, 3)" $testdb
$mysql -e "insert into start (url,depth,robot_delay) values ('http://localhost:7401/cookies/', 4, 3)" $testdb
$crawler -verbose_cookies -home_pages -no_hook -base $testdb
echo
init_sql
echo "#"
echo "# Load a cookie and reuse it on the URLs of the same server"
echo "#"
$crawler -verbose_cookies -robot_delay 0 -depth 5 -no_hook -base $testdb http://localhost:7401/cookies/ 
echo
echo "#"
echo "# Crawler restarts and find an existing cookie in the cookies"
echo "# database, reusing it."
echo "#"
$crawler -noheuristics -verbose_cookies -robot_delay 0 -depth 5 -no_hook -base $testdb http://localhost:7401/cookies/ 
) 2>&1 | sed -e 's/Apache=.*;*/Apache=.../g'
}

# The crawler must handle a page larger than a single network read.
bigpacket() {
init_sql
$crawler -robot_delay 0 -no_hook -base $testdb http://localhost:7400/bigpacket.html
$consistentc -base $testdb
}

# ISO-8859-1 entity handling: crawl a page with such entities, dump it
# with $dumpdata and compare the raw.* checksums.
entities() {
init_sql
rm -f raw.*
$crawler -robot_delay 0 -no_hook -base $testdb http://localhost:7400/bad/iso8859-1.html
$dumpdata -base $testdb
cksum raw.*
rm raw.*
}

# HTTP basic authentication: crawl a protected URL with user:password
# embedded in the URL, then one without credentials; the resulting HTTP
# codes are dumped for comparison.
auth() {
init_sql
$crawler -robot_delay 0 -no_hook -base $testdb http://uu:pass@localhost:7400/protected/foo.html
$crawler -robot_delay 0 -no_hook -base $testdb http://localhost:7400/protected/bar.html
$mysql -e 'select code,url from url' $testdb
}

# Directory selection: robots.txt permissions, then explicit -allow /
# -disallow patterns which are stored and reused by a later -update run.
dirsel() {
init_sql
echo "#"
echo "# Check robots.txt permissions"
echo "#"
$crawler -no_hook -verbose_crawl -depth 10 -robot_delay 0 -base $testdb http://localhost:7400/robots.html
echo "#"
echo "# Set the allow and disallow"
echo "#"
$crawler -no_hook -disallow '/03 /0205.html' -allow '/0303 /0309' -robot_delay 0 -base $testdb http://localhost:7400/
echo "#"
echo "# Crawl using the previously set allow and disallow"
echo "#"
$crawler -no_hook -update -verbose_crawl -depth 100 -base $testdb http://localhost:7400/
}

# Crawl through an HTTP proxy set via the http_proxy environment
# variable.  Requires network access and that specific proxy host, so
# this test is not in the default list run at the bottom of the file.
proxy() {
init_sql
http_proxy=http://proxy.iway.fr:8080/ $crawler -no_hook -verbose_crawl -robot_delay 0 -base $testdb http://www.yahoo.com/
}

# URLs differing only in case must be treated as distinct: crawl,
# dump the url table, then recrawl with -update.  Also not in the
# default test list.
case_sensitive() {
init_sql
$crawler -no_hook -verbose_crawl -robot_delay 0 -depth 5 -base $testdb http://localhost:7400/case.html
$mysql -e "select rowid,url from url" $testdb
$crawler -no_hook -update -verbose_crawl -robot_delay 0 -depth 5 -base $testdb http://localhost:7400/case.html
}

# A <style> section must not leak into the extracted text; dump the
# extract column to verify.
style() {
init_sql
$crawler -no_hook -robot_delay 0 -base $testdb http://localhost:7400/style.html
$mysql -e "select extract from url" $testdb
}

# A Content-Type header followed by ';' parameters must be parsed
# harmlessly; dump url/content_type for comparison.
content_type() {
init_sql
echo "#"
echo "# Allow content type to be followed by ; harmlessly"
echo "#"
$crawler -no_hook -robot_delay 0 -base $testdb http://localhost:7400/nph-content_type.cgi
$mysql -e "select url,content_type from url order by url" $testdb
}

# Database locking: stale-lock removal, abort after waiting too long
# (tuned via ECILA_LOCK_WAIT / ECILA_LOCK_MAX_LOOP), and two concurrent
# crawlers where the second must block until the first finishes.  The
# whole test runs in a subshell so the exported ECILA_* variables do
# not leak into the other tests.
locking() {
(
init_sql
echo "#"
echo "# Remove spurious entry in locks"
echo "#"
$mysql -e "insert into locks values ('1start', '70000')" $testdb
$crawler -verbose_webbase -robot_delay 0 -no_hook -base $testdb http://localhost:7400/
export ECILA_LOCK_WAIT ; ECILA_LOCK_WAIT=2
export ECILA_LOCK_MAX_LOOP ; ECILA_LOCK_MAX_LOOP=2
echo "#"
echo "# Abort if waits too long"
echo "#"
# '$$' is this shell's PID: a lock owned by a live process, never freed.
$mysql -e "insert into locks values ('1start', '$$')" $testdb
$crawler -verbose_webbase -robot_delay 0 -no_hook -base $testdb http://localhost:7400/

init_sql
export ECILA_LOCK_WAIT ; ECILA_LOCK_WAIT=10
export ECILA_LOCK_MAX_LOOP ; ECILA_LOCK_MAX_LOOP=3
$mysql -e "insert into start (url,depth,robot_delay) values ('http://localhost:7400/', 2, 20)" $testdb
echo "#"
echo "# Run a 2 crawlers on the same URL, the second should block until the "
echo "# first is finished. "
echo "#"
echo "# test hp_load"
echo "#"
$crawler -no_hook -verbose_webbase -base $testdb -home_pages &
sleep 3
$crawler -no_hook -base $testdb http://localhost:7400/
$mysql -e "select name from locks" $testdb
echo "#"
echo "# test hp_load_in_core"
echo "#"
$crawler -no_hook -verbose_webbase -base $testdb -unload_keep_start http://localhost:7400/
$mysql -e "delete from robots" $testdb
$crawler -no_hook -verbose_webbase -base $testdb http://localhost:7400/ &
sleep 3
$crawler -no_hook -verbose_webbase -base $testdb http://localhost:7400/
$mysql -e "select name from locks" $testdb
#mysql -e "select url,content_type from url order by url" $testdb
)
}

# Crawl through a locally started SOCKS5 server, then terminate it with
# SIGTERM and show its log.  Requires the socks5 binary and network
# access, so it is not in the default test list.
socks()
{
init_sql
socks5 -b localhost:1080 -d 3 -f -s > logs/socks5.log 2>&1 &
socks=$!
sleep 3
SOCKS_SERVER=localhost:1080 ; export SOCKS_SERVER
$crawler -no_hook -robot_delay 0 -base $testdb http://www.yahoo.com/
kill -15 $socks
cat logs/socks5.log
}

# Placeholder for ad-hoc debugging.  Note that "exit 0" inside a
# function terminates the whole script, so any tests queued after
# "debug" will not run.
debug()
{
init_sql
exit 0
}

# Run the tests named on the command line, or the full default list
# when no arguments are given.  "testing" is defined in the sourced
# test_functions file; ${*:-"..."} is left unquoted on purpose so the
# default list word-splits into individual test names.
testing ${*:-"ftp mime noheuristics modified robots slow timeouts consistent frame corrupted_links disabled comment where_start manual manual2 redirections unload crawl_logic2 crawl_update crawl_logic dumpdata t_html2text external cookies bigpacket entities auth dirsel case_sensitive style content_type locking"}
