This patch makes crawl 0.4 download MS Office documents instead of images.
See http://blogs.23.nu/c0re/stories/4342
--md@hudora.de

Apply from inside the crawl-0.4 source directory:  patch -p1 < this-file
(add -l if the leading whitespace of the context lines does not match your tree).

diff -rubB crawl-0.4/crawl.c crawl-0.4-md/crawl.c
--- crawl-0.4/crawl.c	Sun May 18 03:26:41 2003
+++ crawl-0.4-md/crawl.c	Tue Jul 20 19:39:13 2004
@@ -538,9 +538,20 @@
 	http_setuseragent(agent);
 
 	http_register_dispatch("text/html", html_follower);
+	http_register_dispatch("text/xml", html_follower);
+	http_register_dispatch("application/xhtml+xml", html_follower);
+	http_register_dispatch("application/xml", html_follower);
 	http_register_dispatch("image/", image_saver);
 	http_register_dispatch("audio/", image_saver);
 	http_register_dispatch("video/", image_saver);
+	http_register_dispatch("application/msexcel", image_saver);
+	http_register_dispatch("application/mspowerpoint", image_saver);
+	http_register_dispatch("application/msword", image_saver);
+	http_register_dispatch("application/octet-stream", image_saver);
+	http_register_dispatch("application/vnd.ms-excel", image_saver);
+	http_register_dispatch("application/vnd.ms-powerpoint", image_saver);
+	http_register_dispatch("application/vnd.ms-word", image_saver);
+	http_register_dispatch("application/xls", image_saver);
 
 	/* XXX - bad cludge */
 	http_setcallback(http_movecb, http_moved);
diff -rubB crawl-0.4/crawl.conf crawl-0.4-md/crawl.conf
--- crawl-0.4/crawl.conf	Fri Aug 23 03:04:45 2002
+++ crawl-0.4-md/crawl.conf	Tue Jul 20 19:54:58 2004
@@ -4,26 +4,26 @@
 #
 [General]
 # Directives that determine which URL is being followed
-Url-Include=http://.*\.citi\.umich\.edu
-Url-Exclude=([^a-z]ads\.|\.(ico|eps|ps|gz|c|h|tar|exe|doc|pdf|ppt|txt|diff)$)
+Url-Include=http://.*
+Url-Exclude=([^a-z]ads\.|\.(ico|eps|ps|gz|c|h|tar|exe|gif|pdf|png|jpg|jpeg|txt|diff)$)
 
 # Directives that determine which images are downloaded and where
 # they will be saved.
-Img-Include=\.(jpg|jpeg)
+Img-Include=\.(DOC|doc)
 Img-Exclude=thumbs\.
 Img-Directory=.
 
 # The maximum depth of the crawl, -1 means unlimited, be very
 # careful with this.
-#Max-Depth=-1
+Max-Depth=10
 
 # An external filter that takes URL on stdin, it must return either
 # 'n' or 'y' on stdout.
 #External-Filter=./external
 
 #Minimum and maximum length of media, unless overwritten below.
-Min-Length=20000
-Max-Length=400000
+Min-Length=2000
+Max-Length=20000000
 
 [HTTP]
 #Agent=Crawl/0.2