#!/usr/bin/python
'''A very simple gopherspace spider by drt.

For every directory entry it finds (except info lines, error lines and
entries pointing at the placeholder host "error.host") it writes one line
"selector<TAB>host<TAB>port" to stdout.  Progress messages go to stderr.

by drt - http://koeln.ccc.de/~drt/
'''

try:
    import Queue
except ImportError:
    # The same module is called "queue" on Python 3.
    import queue as Queue
try:
    import gopherlib
except ImportError:
    # gopherlib is not shipped with every Python (it was dropped from the
    # standard library after 2.5).  spider() reports this when it actually
    # needs the module, so callers can still supply their own fetcher.
    gopherlib = None
import time
import sys
import traceback

# minimum seconds pause between fetching two files from the same server
hostpause = 30

# NOTE(review): never read or written by the code in this file.
faildhosts = {}
# Maps "host:port" to the list of selectors successfully fetched from it.
visitedhosts = {}


def _gopher_fetch(selector, host, port):
    '''Fetch one directory with gopherlib and return its list of entries.'''
    return gopherlib.get_directory(
        gopherlib.send_selector(selector, host, port))


def _was_visited(key, selector):
    '''Return True if selector was already fetched from the "host:port" key.'''
    return selector in visitedhosts.get(key, ())


def spider(todo, fetch=None):
    '''Crawl every directory reachable from the items queued in todo.

    todo  -- a Queue.Queue of (selector, host, port) tuples to start from.
             Newly discovered directories are appended to it.
    fetch -- optional callable fetch(selector, host, port) returning a
             sequence of entries whose first five fields are
             (type, name, selector, host, port).  Defaults to fetching
             through gopherlib.

    A server is contacted at most once every `hostpause` seconds.  Items
    whose server was contacted more recently are moved to a retry queue;
    once the current queue is drained the spider sleeps for half of
    `hostpause` and carries on with the retry queue.

    Successfully fetched selectors are recorded in the module level
    `visitedhosts` dictionary.  Raises ImportError if no fetch callable is
    given and gopherlib cannot be imported.
    '''
    if fetch is None:
        if gopherlib is None:
            raise ImportError('gopherlib is not available; '
                              'pass a fetch callable to spider()')
        fetch = _gopher_fetch

    retry = Queue.Queue()
    hosttimings = {}
    # Every (host:port, selector) pair that has ever been queued or tried.
    # Without it a directory linked from several pages is queued (and later
    # fetched) once per link, and a dead directory is re-queued forever.
    seen = set()

    while not todo.empty():
        while not todo.empty():
            (selector, host, port) = todo.get()
            key = '%s:%s' % (host, port)
            seen.add((key, selector))
            if _was_visited(key, selector):
                # Seeded or queued twice before its first fetch completed.
                continue

            now = time.time()
            if host in hosttimings and hosttimings[host] + hostpause > now:
                retry.put((selector, host, port))
                continue
            hosttimings[host] = time.time()

            sys.stderr.write('%s %s %r' % (host, port, selector))
            sys.stderr.flush()
            try:
                entries = fetch(selector, host, port)
                count = len(entries)
            except Exception:
                # Best effort: an unreachable or misbehaving server must not
                # stop the crawl.  Catching Exception rather than everything
                # keeps Ctrl-C and sys.exit() working.
                sys.stderr.write(' failed\n')
                sys.stderr.flush()
                continue
            sys.stderr.write(' %d\n' % count)
            sys.stderr.flush()

            visitedhosts.setdefault(key, []).append(selector)

            for entry in entries:
                try:
                    (kind, _name, link_selector,
                     link_host, link_port) = entry[:5]
                except (TypeError, ValueError):
                    # Malformed entry: skip it, keep the rest of the listing.
                    continue
                if kind in ('i', '3') or link_host == 'error.host':
                    continue
                sys.stdout.write('%s\t%s\t%s\n'
                                 % (link_selector, link_host, link_port))
                if kind != '1':
                    # Only type '1' entries are followed.
                    continue
                link_key = '%s:%s' % (link_host, link_port)
                link = (link_key, link_selector)
                if link in seen or _was_visited(link_key, link_selector):
                    continue
                seen.add(link)
                todo.put((link_selector, link_host, link_port))

        if retry.empty():
            # Nothing was deferred, so there is nothing to wait for.
            break
        sys.stderr.write('pausing to give servers a rest, queuesize: %d\n'
                         % retry.qsize())
        time.sleep(hostpause / 2.0)
        todo = retry
        retry = Queue.Queue()


def main():
    '''Queue the three hard-coded start directories and crawl from them.'''
    todo = Queue.Queue()
    for seed in (('1/', 'w8n.koeln.ccc.de', 70),
                 ('1/', 'gopher.floodgap.com', 70),
                 ('1/', 'gopher.nct.de', 70)):
        todo.put(seed)
    spider(todo)


if __name__ == '__main__':
    main()